external/libgav1: update to v0.16.3 am: 81461368d4
Original change: https://android-review.googlesource.com/c/platform/external/libgav1/+/1676768
Change-Id: I69cad99291a6817597bcd9e22fdfb62ed183f089
diff --git a/AUTHORS b/AUTHORS
deleted file mode 100644
index d92ea0a..0000000
--- a/AUTHORS
+++ /dev/null
@@ -1,6 +0,0 @@
-# This is the list of libgav1 authors for copyright purposes.
-#
-# This does not necessarily list everyone who has contributed code, since in
-# some cases, their employer may be the copyright holder. To see the full list
-# of contributors, see the revision history in source control.
-Google LLC
diff --git a/Android.bp b/Android.bp
index d3ddd1a..3ea8b18 100644
--- a/Android.bp
+++ b/Android.bp
@@ -50,10 +50,10 @@
"libgav1",
],
- // Note: if optimizations are required for x86 the sse4 files should be
- // split to their own target to receive the correct flagging. All files in
- // the library can be built for any target without producing empty object
- // files.
+ // Note: if optimizations are required for x86 the avx2 & sse4 files should
+ // be split to their own targets to receive the correct flagging. All files
+ // in the library can be built for any target without producing empty
+ // object files.
srcs: [
"libgav1/src/buffer_pool.cc",
"libgav1/src/decoder.cc",
@@ -67,9 +67,10 @@
"libgav1/src/dsp/arm/intra_edge_neon.cc",
"libgav1/src/dsp/arm/intrapred_cfl_neon.cc",
"libgav1/src/dsp/arm/intrapred_directional_neon.cc",
- "libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc",
+ "libgav1/src/dsp/arm/intrapred_filter_neon.cc",
"libgav1/src/dsp/arm/intrapred_neon.cc",
"libgav1/src/dsp/arm/intrapred_smooth_neon.cc",
+ "libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc",
"libgav1/src/dsp/arm/inverse_transform_neon.cc",
"libgav1/src/dsp/arm/loop_filter_neon.cc",
"libgav1/src/dsp/arm/loop_restoration_neon.cc",
@@ -89,6 +90,10 @@
"libgav1/src/dsp/film_grain.cc",
"libgav1/src/dsp/intra_edge.cc",
"libgav1/src/dsp/intrapred.cc",
+ "libgav1/src/dsp/intrapred_cfl.cc",
+ "libgav1/src/dsp/intrapred_directional.cc",
+ "libgav1/src/dsp/intrapred_filter.cc",
+ "libgav1/src/dsp/intrapred_smooth.cc",
"libgav1/src/dsp/inverse_transform.cc",
"libgav1/src/dsp/loop_filter.cc",
"libgav1/src/dsp/loop_restoration.cc",
@@ -100,15 +105,23 @@
"libgav1/src/dsp/warp.cc",
"libgav1/src/dsp/weight_mask.cc",
"libgav1/src/dsp/x86/average_blend_sse4.cc",
+ "libgav1/src/dsp/x86/cdef_avx2.cc",
"libgav1/src/dsp/x86/cdef_sse4.cc",
+ "libgav1/src/dsp/x86/convolve_avx2.cc",
"libgav1/src/dsp/x86/convolve_sse4.cc",
"libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc",
+ "libgav1/src/dsp/x86/film_grain_sse4.cc",
"libgav1/src/dsp/x86/intra_edge_sse4.cc",
"libgav1/src/dsp/x86/intrapred_cfl_sse4.cc",
+ "libgav1/src/dsp/x86/intrapred_directional_sse4.cc",
+ "libgav1/src/dsp/x86/intrapred_filter_sse4.cc",
"libgav1/src/dsp/x86/intrapred_smooth_sse4.cc",
"libgav1/src/dsp/x86/intrapred_sse4.cc",
"libgav1/src/dsp/x86/inverse_transform_sse4.cc",
"libgav1/src/dsp/x86/loop_filter_sse4.cc",
+ "libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc",
+ "libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc",
+ "libgav1/src/dsp/x86/loop_restoration_avx2.cc",
"libgav1/src/dsp/x86/loop_restoration_sse4.cc",
"libgav1/src/dsp/x86/mask_blend_sse4.cc",
"libgav1/src/dsp/x86/motion_field_projection_sse4.cc",
@@ -140,8 +153,8 @@
"libgav1/src/tile/bitstream/partition.cc",
"libgav1/src/tile/bitstream/transform_size.cc",
"libgav1/src/tile/prediction.cc",
- "libgav1/src/tile_scratch_buffer.cc",
"libgav1/src/tile/tile.cc",
+ "libgav1/src/tile_scratch_buffer.cc",
"libgav1/src/utils/bit_reader.cc",
"libgav1/src/utils/block_parameters_holder.cc",
"libgav1/src/utils/constants.cc",
@@ -149,7 +162,6 @@
"libgav1/src/utils/entropy_decoder.cc",
"libgav1/src/utils/executor.cc",
"libgav1/src/utils/logging.cc",
- "libgav1/src/utils/parameter_tree.cc",
"libgav1/src/utils/raw_bit_reader.cc",
"libgav1/src/utils/segmentation.cc",
"libgav1/src/utils/segmentation_map.cc",
diff --git a/README.version b/README.version
index b65b65a..89f9d10 100644
--- a/README.version
+++ b/README.version
@@ -1,5 +1,5 @@
URL: https://chromium.googlesource.com/codecs/libgav1
-Version: v0.16.0
+Version: v0.16.3
BugComponent: 324837
Local Modifications:
None
diff --git a/libgav1/CMakeLists.txt b/libgav1/CMakeLists.txt
index f033bae..5e9e17a 100644
--- a/libgav1/CMakeLists.txt
+++ b/libgav1/CMakeLists.txt
@@ -36,6 +36,26 @@
set(libgav1_examples "${libgav1_root}/examples")
set(libgav1_source "${libgav1_root}/src")
+include("${libgav1_root}/cmake/libgav1_options.cmake")
+
+libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
+ "Enables optimized code." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_AVX2 HELPSTRING "Enables avx2 optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
+ VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
+ "Enables sse4.1 optimizations." VALUE ON)
+libgav1_option(NAME LIBGAV1_ENABLE_TESTS HELPSTRING "Enables tests." VALUE ON)
+libgav1_option(
+ NAME LIBGAV1_VERBOSE HELPSTRING
+ "Enables verbose build system output. Higher numbers are more verbose." VALUE
+ OFF)
+
+if(NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
include(FindThreads)
include("${libgav1_examples}/libgav1_examples.cmake")
@@ -45,29 +65,14 @@
include("${libgav1_root}/cmake/libgav1_helpers.cmake")
include("${libgav1_root}/cmake/libgav1_install.cmake")
include("${libgav1_root}/cmake/libgav1_intrinsics.cmake")
-include("${libgav1_root}/cmake/libgav1_options.cmake")
include("${libgav1_root}/cmake/libgav1_sanitizer.cmake")
include("${libgav1_root}/cmake/libgav1_targets.cmake")
include("${libgav1_root}/cmake/libgav1_variables.cmake")
+include("${libgav1_root}/tests/libgav1_tests.cmake")
include("${libgav1_source}/dsp/libgav1_dsp.cmake")
include("${libgav1_source}/libgav1_decoder.cmake")
include("${libgav1_source}/utils/libgav1_utils.cmake")
-libgav1_option(NAME LIBGAV1_ENABLE_OPTIMIZATIONS HELPSTRING
- "Enables optimized code." VALUE ON)
-libgav1_option(NAME LIBGAV1_ENABLE_NEON HELPSTRING "Enables neon optimizations."
- VALUE ON)
-libgav1_option(NAME LIBGAV1_ENABLE_SSE4_1 HELPSTRING
- "Enables sse4.1 optimizations." VALUE ON)
-libgav1_option(
- NAME LIBGAV1_VERBOSE HELPSTRING
- "Enables verbose build system output. Higher numbers are more verbose." VALUE
- OFF)
-
-if(NOT CMAKE_BUILD_TYPE)
- set(CMAKE_BUILD_TYPE Release)
-endif()
-
libgav1_optimization_detect()
libgav1_set_build_definitions()
libgav1_set_cxx_flags()
@@ -107,13 +112,27 @@
separate_arguments(LIBGAV1_EXE_LINKER_FLAGS)
endif()
-add_subdirectory("${libgav1_root}/third_party/abseil-cpp"
- "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
+# Set test-only flags based on LIBGAV1_CXX_FLAGS.
+libgav1_set_test_flags()
+
+set(libgav1_abseil "${libgav1_root}/third_party/abseil-cpp")
+if(NOT EXISTS "${libgav1_abseil}")
+ message(
+ FATAL_ERROR
+ "Abseil not found. This dependency is required by the"
+ " examples & tests and libgav1 when LIBGAV1_THREADPOOL_USE_STD_MUTEX is"
+ " not defined. To continue, download the Abseil repository to"
+ " third_party/abseil-cpp:\n git \\\n -C ${libgav1_root} \\\n"
+ " clone \\\n"
+ " https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp")
+endif()
+add_subdirectory("${libgav1_abseil}" "${libgav1_abseil_build}" EXCLUDE_FROM_ALL)
libgav1_reset_target_lists()
libgav1_add_dsp_targets()
libgav1_add_decoder_targets()
libgav1_add_examples_targets()
+libgav1_add_tests_targets()
libgav1_add_utils_targets()
libgav1_setup_install_target()
diff --git a/libgav1/README.md b/libgav1/README.md
index b935679..3155970 100644
--- a/libgav1/README.md
+++ b/libgav1/README.md
@@ -20,7 +20,18 @@
From within the libgav1 directory:
```shell
- $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ $ git clone https://github.com/abseil/abseil-cpp.git third_party/abseil-cpp
+ ```
+
+ Note: Abseil is required by the examples and tests. libgav1 will depend on
+ it if `LIBGAV1_THREADPOOL_USE_STD_MUTEX` is set to `0` (see below).
+
+4. (Optional) [GoogleTest](https://github.com/google/googletest)
+
+ From within the libgav1 directory:
+
+ ```shell
+ $ git clone https://github.com/google/googletest.git third_party/googletest
```
### Compile
@@ -39,10 +50,13 @@
[symbol reduction](#symbol-reduction) in an optimized build to keep all
versions of dsp functions available. Automatically defined in
`src/dsp/dsp.h` if unset.
+* `LIBGAV1_ENABLE_AVX2`: define to a non-zero value to enable avx2
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
* `LIBGAV1_ENABLE_NEON`: define to a non-zero value to enable NEON
- optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset.
* `LIBGAV1_ENABLE_SSE4_1`: define to a non-zero value to enable sse4.1
- optimizations. Automatically defined in `src/dsp/dsp.h` if unset.
+ optimizations. Automatically defined in `src/utils/cpu.h` if unset. Note
+ setting this to 0 will also disable AVX2.
* `LIBGAV1_ENABLE_LOGGING`: define to 0/1 to control debug logging.
Automatically defined in `src/utils/logging.h` if unset.
* `LIBGAV1_EXAMPLES_ENABLE_LOGGING`: define to 0/1 to control error logging in
@@ -55,10 +69,11 @@
* `LIBGAV1_THREADPOOL_USE_STD_MUTEX`: controls use of std::mutex and
absl::Mutex in ThreadPool. Defining this to 1 will remove any Abseil
dependency from the core library. Automatically defined in
- `src/utils/threadpool.h` if unset.
+ `src/utils/threadpool.h` if unset. Defaults to 1 on Android & iOS, 0
+ otherwise.
* `LIBGAV1_MAX_THREADS`: sets the number of threads that the library is
- allowed to create. Has to be an integer > 0. Otherwise this is ignored.
- The default value is 128.
+ allowed to create. Has to be an integer > 0. Otherwise this is ignored. The
+ default value is 128.
* `LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER`: the threshold multiplier that
is used to determine when to use frame parallel decoding. Frame parallel
decoding will be used if |threads| > |tile_count| * this multiplier. Has to
diff --git a/libgav1/cmake/libgav1_build_definitions.cmake b/libgav1/cmake/libgav1_build_definitions.cmake
index 930d8f5..fc83490 100644
--- a/libgav1/cmake/libgav1_build_definitions.cmake
+++ b/libgav1/cmake/libgav1_build_definitions.cmake
@@ -21,7 +21,24 @@
string(TOLOWER "${CMAKE_BUILD_TYPE}" build_type_lowercase)
libgav1_load_version_info()
- set(LIBGAV1_SOVERSION 0)
+
+ # Library version info. See the libtool docs for updating the values:
+ # https://www.gnu.org/software/libtool/manual/libtool.html#Updating-version-info
+ #
+ # c=<current>, r=<revision>, a=<age>
+ #
+ # libtool generates a .so file as .so.[c-a].a.r, while -version-info c:r:a is
+ # passed to libtool.
+ #
+ # We set LIBGAV1_SOVERSION = [c-a].a.r
+ set(LT_CURRENT 0)
+ set(LT_REVISION 0)
+ set(LT_AGE 0)
+ math(EXPR LIBGAV1_SOVERSION_MAJOR "${LT_CURRENT} - ${LT_AGE}")
+ set(LIBGAV1_SOVERSION "${LIBGAV1_SOVERSION_MAJOR}.${LT_AGE}.${LT_REVISION}")
+ unset(LT_CURRENT)
+ unset(LT_REVISION)
+ unset(LT_AGE)
list(APPEND libgav1_include_paths "${libgav1_root}" "${libgav1_root}/src"
"${libgav1_build}" "${libgav1_root}/third_party/abseil-cpp")
@@ -89,9 +106,7 @@
endif()
if(build_type_lowercase MATCHES "rel")
- # TODO(tomfinegan): this value is only a concern for the core library and
- # can be made smaller if the test targets are avoided.
- list(APPEND libgav1_base_cxx_flags "-Wstack-usage=196608")
+ list(APPEND libgav1_base_cxx_flags "-Wframe-larger-than=196608")
endif()
list(APPEND libgav1_msvc_cxx_flags
@@ -144,6 +159,7 @@
# Source file names ending in these suffixes will have the appropriate
# compiler flags added to their compile commands to enable intrinsics.
+ set(libgav1_avx2_source_file_suffix "avx2.cc")
set(libgav1_neon_source_file_suffix "neon.cc")
set(libgav1_sse4_source_file_suffix "sse4.cc")
endmacro()
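A worked example of the libtool scheme described in the versioning comment above: -version-info 2:1:1 (c=2, r=1, a=1) gives LIBGAV1_SOVERSION_MAJOR = c - a = 1 and LIBGAV1_SOVERSION = 1.1.1, i.e. a libgav1.so.1.1.1 whose SONAME carries the major .1; the current 0:0:0 values reduce to 0 and 0.0.0. The libgav1_targets.cmake hunk further below consumes these as the shared library's VERSION and SOVERSION properties.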
diff --git a/libgav1/cmake/libgav1_cpu_detection.cmake b/libgav1/cmake/libgav1_cpu_detection.cmake
index 6972d34..e17e27c 100644
--- a/libgav1/cmake/libgav1_cpu_detection.cmake
+++ b/libgav1/cmake/libgav1_cpu_detection.cmake
@@ -24,10 +24,17 @@
if(cpu_lowercase MATCHES "^arm|^aarch64")
set(libgav1_have_neon ON)
elseif(cpu_lowercase MATCHES "^x86|amd64")
+ set(libgav1_have_avx2 ON)
set(libgav1_have_sse4 ON)
endif()
endif()
+ if(libgav1_have_avx2 AND LIBGAV1_ENABLE_AVX2)
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=1")
+ else()
+ list(APPEND libgav1_defines "LIBGAV1_ENABLE_AVX2=0")
+ endif()
+
if(libgav1_have_neon AND LIBGAV1_ENABLE_NEON)
list(APPEND libgav1_defines "LIBGAV1_ENABLE_NEON=1")
else()
diff --git a/libgav1/cmake/libgav1_flags.cmake b/libgav1/cmake/libgav1_flags.cmake
index 0b8df60..a5408e2 100644
--- a/libgav1/cmake/libgav1_flags.cmake
+++ b/libgav1/cmake/libgav1_flags.cmake
@@ -118,6 +118,12 @@
if(cxx_flags)
message("--- Testing flags from $cxx_flags: " "${cxx_flags}")
foreach(cxx_flag ${cxx_flags})
+ # Between 3.17.0 and 3.18.2 check_cxx_compiler_flag() sets a normal
+ # variable at parent scope while check_cxx_source_compiles() continues
+ # to set an internal cache variable, so we unset both to avoid the
+ # failure / success state persisting between checks. See
+ # https://gitlab.kitware.com/cmake/cmake/-/issues/21207.
+ unset(cxx_flag_test_passed)
unset(cxx_flag_test_passed CACHE)
message("--- Testing flag: ${cxx_flag}")
check_cxx_compiler_flag("${cxx_flag}" cxx_flag_test_passed)
@@ -199,7 +205,7 @@
# Restore cached global exe linker flags.
if(cached_CMAKE_EXE_LINKER_FLAGS)
- set(CMAKE_EXE_LINKER_FLAGS cached_CMAKE_EXE_LINKER_FLAGS)
+ set(CMAKE_EXE_LINKER_FLAGS ${cached_CMAKE_EXE_LINKER_FLAGS})
else()
unset(CMAKE_EXE_LINKER_FLAGS)
endif()
@@ -243,3 +249,15 @@
libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES ${cxx_flag_lists})
endmacro()
+
+# Sets LIBGAV1_TEST_C_FLAGS and LIBGAV1_TEST_CXX_FLAGS.
+#
+# Note: libgav1_set_cxx_flags() must be called before this macro. Furthermore,
+# the call to this macro should be made after all additions to LIBGAV1_CXX_FLAGS
+# are complete.
+macro(libgav1_set_test_flags)
+ if(LIBGAV1_ENABLE_TESTS)
+ set(LIBGAV1_TEST_CXX_FLAGS ${LIBGAV1_CXX_FLAGS})
+ list(FILTER LIBGAV1_TEST_CXX_FLAGS EXCLUDE REGEX "-Wframe-larger-than")
+ endif()
+endmacro()
diff --git a/libgav1/cmake/libgav1_helpers.cmake b/libgav1/cmake/libgav1_helpers.cmake
index 76d8d67..ac16257 100644
--- a/libgav1/cmake/libgav1_helpers.cmake
+++ b/libgav1/cmake/libgav1_helpers.cmake
@@ -20,7 +20,13 @@
# Kills build generation using message(FATAL_ERROR) and outputs all data passed
# to the console via use of $ARGN.
macro(libgav1_die)
- message(FATAL_ERROR ${ARGN})
+ # macro parameters are not variables so a temporary is needed to work with
+ # list().
+ set(msg ${ARGN})
+ # message(${ARGN}) will merge all list elements with no separator while
+ # "${ARGN}" will output the list as a ';' delimited string.
+ list(JOIN msg " " msg)
+ message(FATAL_ERROR "${msg}")
endmacro()
# Converts semi-colon delimited list variable(s) to string. Output is written to
@@ -94,10 +100,10 @@
"${dummy_source_dir}/libgav1_${cdsf_TARGET}_${cdsf_BASENAME}.cc")
set(dummy_source_code
"// Generated file. DO NOT EDIT!\n"
- "// C++ source file created for target ${cdsf_TARGET}. \n"
- "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void);\n"
+ "// C++ source file created for target ${cdsf_TARGET}.\n"
+ "void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void)\;\n"
"void libgav1_${cdsf_TARGET}_${cdsf_BASENAME}_dummy_function(void) {}\n")
- file(WRITE "${dummy_source_file}" "${dummy_source_code}")
+ file(WRITE "${dummy_source_file}" ${dummy_source_code})
target_sources(${cdsf_TARGET} PRIVATE ${dummy_source_file})
diff --git a/libgav1/cmake/libgav1_intrinsics.cmake b/libgav1/cmake/libgav1_intrinsics.cmake
index 039ef35..a2e9ddb 100644
--- a/libgav1/cmake/libgav1_intrinsics.cmake
+++ b/libgav1/cmake/libgav1_intrinsics.cmake
@@ -38,6 +38,12 @@
if(NOT MSVC)
set(${intrinsics_VARIABLE} "${LIBGAV1_NEON_INTRINSICS_FLAG}")
endif()
+ elseif(intrinsics_SUFFIX MATCHES "avx2")
+ if(MSVC)
+ set(${intrinsics_VARIABLE} "/arch:AVX2")
+ else()
+ set(${intrinsics_VARIABLE} "-mavx2")
+ endif()
elseif(intrinsics_SUFFIX MATCHES "sse4")
if(NOT MSVC)
set(${intrinsics_VARIABLE} "-msse4.1")
@@ -57,7 +63,7 @@
# necessary: libgav1_process_intrinsics_sources(SOURCES <sources>)
#
# Detects requirement for intrinsics flags using source file name suffix.
-# Currently supports only SSE4.1.
+# Currently supports AVX2 and SSE4.1.
macro(libgav1_process_intrinsics_sources)
unset(arg_TARGET)
unset(arg_SOURCES)
@@ -71,6 +77,25 @@
"SOURCES required.")
endif()
+ if(LIBGAV1_ENABLE_AVX2 AND libgav1_have_avx2)
+ unset(avx2_sources)
+ list(APPEND avx2_sources ${arg_SOURCES})
+
+ list(FILTER avx2_sources INCLUDE REGEX
+ "${libgav1_avx2_source_file_suffix}$")
+
+ if(avx2_sources)
+ unset(avx2_flags)
+ libgav1_get_intrinsics_flag_for_suffix(SUFFIX
+ ${libgav1_avx2_source_file_suffix}
+ VARIABLE avx2_flags)
+ if(avx2_flags)
+ libgav1_set_compiler_flags_for_sources(SOURCES ${avx2_sources} FLAGS
+ ${avx2_flags})
+ endif()
+ endif()
+ endif()
+
if(LIBGAV1_ENABLE_SSE4_1 AND libgav1_have_sse4)
unset(sse4_sources)
list(APPEND sse4_sources ${arg_SOURCES})
diff --git a/libgav1/cmake/libgav1_sanitizer.cmake b/libgav1/cmake/libgav1_sanitizer.cmake
index 4bb2263..2f9ee07 100644
--- a/libgav1/cmake/libgav1_sanitizer.cmake
+++ b/libgav1/cmake/libgav1_sanitizer.cmake
@@ -39,7 +39,9 @@
list(APPEND LIBGAV1_CXX_FLAGS "-fno-omit-frame-pointer"
"-fno-optimize-sibling-calls")
- libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
+ # Check the linker flags first as they may be required in the compile check
+ # to avoid undefined symbols related to the sanitizer.
libgav1_test_exe_linker_flag(FLAG_LIST_VAR_NAME LIBGAV1_EXE_LINKER_FLAGS)
+ libgav1_test_cxx_flag(FLAG_LIST_VAR_NAMES LIBGAV1_CXX_FLAGS FLAG_REQUIRED)
endif()
endmacro()
diff --git a/libgav1/cmake/libgav1_targets.cmake b/libgav1/cmake/libgav1_targets.cmake
index 78b4865..997f8bd 100644
--- a/libgav1/cmake/libgav1_targets.cmake
+++ b/libgav1/cmake/libgav1_targets.cmake
@@ -29,7 +29,7 @@
# Creates an executable target. The target name is passed as a parameter to the
# NAME argument, and the sources passed as a parameter to the SOURCES argument:
-# libgav1_add_test(NAME <name> SOURCES <sources> [optional args])
+# libgav1_add_executable(NAME <name> SOURCES <sources> [optional args])
#
# Optional args:
# cmake-format: off
@@ -115,15 +115,35 @@
target_include_directories(${exe_NAME} PRIVATE ${exe_INCLUDES})
endif()
- if(exe_COMPILE_FLAGS OR LIBGAV1_CXX_FLAGS)
+ unset(exe_LIBGAV1_COMPILE_FLAGS)
+ if(exe_TEST)
+ list(FILTER exe_SOURCES INCLUDE REGEX "\\.c$")
+ list(LENGTH exe_SOURCES exe_SOURCES_length)
+ if(exe_SOURCES_length EQUAL 0)
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_CXX_FLAGS})
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_TEST_C_FLAGS})
+ endif()
+ else()
+ set(exe_LIBGAV1_COMPILE_FLAGS ${LIBGAV1_CXX_FLAGS})
+ endif()
+
+ if(exe_COMPILE_FLAGS OR exe_LIBGAV1_COMPILE_FLAGS)
target_compile_options(${exe_NAME}
- PRIVATE ${exe_COMPILE_FLAGS} ${LIBGAV1_CXX_FLAGS})
+ PRIVATE ${exe_COMPILE_FLAGS}
+ ${exe_LIBGAV1_COMPILE_FLAGS})
endif()
if(exe_LINK_FLAGS OR LIBGAV1_EXE_LINKER_FLAGS)
- set_target_properties(${exe_NAME}
- PROPERTIES LINK_FLAGS ${exe_LINK_FLAGS}
- ${LIBGAV1_EXE_LINKER_FLAGS})
+ list(APPEND exe_LINK_FLAGS "${LIBGAV1_EXE_LINKER_FLAGS}")
+ if(${CMAKE_VERSION} VERSION_LESS "3.13")
+ # LINK_FLAGS is managed as a string.
+ libgav1_set_and_stringify(SOURCE "${exe_LINK_FLAGS}" DEST exe_LINK_FLAGS)
+ set_target_properties(${exe_NAME}
+ PROPERTIES LINK_FLAGS "${exe_LINK_FLAGS}")
+ else()
+ target_link_options(${exe_NAME} PRIVATE ${exe_LINK_FLAGS})
+ endif()
endif()
if(exe_OBJLIB_DEPS)
@@ -137,7 +157,7 @@
endif()
if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
- target_compile_definitions(${lib_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
+ target_compile_definitions(${exe_NAME} PRIVATE "LIBGAV1_BUILDING_DLL=0")
endif()
if(exe_LIB_DEPS)
@@ -321,7 +341,9 @@
endif()
if(lib_TYPE STREQUAL SHARED AND NOT MSVC)
- set_target_properties(${lib_NAME} PROPERTIES SOVERSION ${LIBGAV1_SOVERSION})
+ set_target_properties(${lib_NAME}
+ PROPERTIES VERSION ${LIBGAV1_SOVERSION} SOVERSION
+ ${LIBGAV1_SOVERSION_MAJOR})
endif()
if(BUILD_SHARED_LIBS AND (MSVC OR WIN32))
diff --git a/libgav1/examples/gav1_decode.cc b/libgav1/examples/gav1_decode.cc
index e7d3246..1408e8c 100644
--- a/libgav1/examples/gav1_decode.cc
+++ b/libgav1/examples/gav1_decode.cc
@@ -370,16 +370,15 @@
const libgav1::DecoderBuffer* buffer;
status = decoder.DequeueFrame(&buffer);
- if (status != libgav1::kStatusOk &&
- status != libgav1::kStatusNothingToDequeue) {
- fprintf(stderr, "Unable to dequeue frame: %s\n",
- libgav1::GetErrorString(status));
- return EXIT_FAILURE;
- }
if (status == libgav1::kStatusNothingToDequeue) {
dequeue_finished = true;
continue;
}
+ if (status != libgav1::kStatusOk) {
+ fprintf(stderr, "Unable to dequeue frame: %s\n",
+ libgav1::GetErrorString(status));
+ return EXIT_FAILURE;
+ }
dequeue_finished = false;
if (buffer == nullptr) continue;
++decoded_frames;
@@ -420,6 +419,9 @@
input_buffers.ReleaseInputBuffer(input_buffer);
}
input_buffer = nullptr;
+      // Clear any in-progress frames to ensure the output frame limit is
+      // respected.
+ decoder.SignalEOS();
}
} while (input_buffer != nullptr ||
(!file_reader->IsEndOfFile() && !limit_reached) ||
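The reordering above lets the common kStatusNothingToDequeue case exit early before the error check, and the new SignalEOS() call drains in-flight frames once the output frame limit is reached. A minimal sketch of the same dequeue pattern, using only the libgav1 calls visible in this diff (the include path and the surrounding enqueue/setup code are assumptions, not taken from this change):

```cpp
#include <cstdio>

#include "gav1/decoder.h"  // Header path assumed; public headers live in src/gav1/.

// Minimal sketch of the dequeue pattern above; assumes |decoder| was
// initialized and fed via EnqueueFrame(). Not the full gav1_decode loop.
bool DequeueAllAvailableFrames(libgav1::Decoder& decoder) {
  for (;;) {
    const libgav1::DecoderBuffer* buffer;
    const libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
    // Common case first: parsing is simply ahead of decoding, not an error.
    if (status == libgav1::kStatusNothingToDequeue) return true;
    if (status != libgav1::kStatusOk) {
      fprintf(stderr, "Unable to dequeue frame: %s\n",
              libgav1::GetErrorString(status));
      return false;
    }
    if (buffer == nullptr) continue;  // Nothing displayable for this frame.
    // ... consume |buffer| ...
  }
}
```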
diff --git a/libgav1/examples/logging.h b/libgav1/examples/logging.h
index c0bcad7..cf5a09f 100644
--- a/libgav1/examples/logging.h
+++ b/libgav1/examples/logging.h
@@ -46,7 +46,7 @@
#define LIBGAV1_EXAMPLES_LOG_ERROR(error_string) \
do { \
constexpr const char* libgav1_examples_basename = \
- ::libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ libgav1::examples::Basename(__FILE__, sizeof(__FILE__) - 1); \
fprintf(stderr, "%s:%d (%s): %s.\n", libgav1_examples_basename, __LINE__, \
__func__, error_string); \
} while (false)
diff --git a/libgav1/src/decoder_impl.cc b/libgav1/src/decoder_impl.cc
index e40c692..e23903c 100644
--- a/libgav1/src/decoder_impl.cc
+++ b/libgav1/src/decoder_impl.cc
@@ -31,13 +31,11 @@
#include "src/obu_parser.h"
#include "src/post_filter.h"
#include "src/prediction_mask.h"
-#include "src/quantizer.h"
#include "src/threading_strategy.h"
#include "src/utils/blocking_counter.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/raw_bit_reader.h"
#include "src/utils/segmentation.h"
#include "src/utils/threadpool.h"
@@ -632,10 +630,6 @@
}
StatusCode DecoderImpl::Init() {
- if (!GenerateWedgeMask(&wedge_masks_)) {
- LIBGAV1_DLOG(ERROR, "GenerateWedgeMask() failed.");
- return kStatusOutOfMemory;
- }
if (!output_frame_queue_.Init(kMaxLayers)) {
LIBGAV1_DLOG(ERROR, "output_frame_queue_.Init() failed.");
return kStatusOutOfMemory;
@@ -854,6 +848,14 @@
LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
return status;
}
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
if (IsNewSequenceHeader(*obu)) {
const ObuSequenceHeader& sequence_header = obu->sequence_header();
const Libgav1ImageFormat image_format =
@@ -1043,6 +1045,14 @@
LIBGAV1_DLOG(ERROR, "Failed to parse OBU.");
return status;
}
+ if (!MaybeInitializeQuantizerMatrix(obu->frame_header())) {
+ LIBGAV1_DLOG(ERROR, "InitializeQuantizerMatrix() failed.");
+ return kStatusOutOfMemory;
+ }
+ if (!MaybeInitializeWedgeMasks(obu->frame_header().frame_type)) {
+ LIBGAV1_DLOG(ERROR, "InitializeWedgeMasks() failed.");
+ return kStatusOutOfMemory;
+ }
if (IsNewSequenceHeader(*obu)) {
const ObuSequenceHeader& sequence_header = obu->sequence_header();
const Libgav1ImageFormat image_format =
@@ -1145,7 +1155,7 @@
buffer_.bitdepth = yuv_buffer->bitdepth();
const int num_planes =
yuv_buffer->is_monochrome() ? kMaxPlanesMonochrome : kMaxPlanes;
- int plane = 0;
+ int plane = kPlaneY;
for (; plane < num_planes; ++plane) {
buffer_.stride[plane] = yuv_buffer->stride(plane);
buffer_.plane[plane] = yuv_buffer->data(plane);
@@ -1188,6 +1198,12 @@
"Failed to allocate memory for loop restoration info units.");
return kStatusOutOfMemory;
}
+ ThreadingStrategy& threading_strategy =
+ frame_scratch_buffer->threading_strategy;
+ if (!is_frame_parallel_ &&
+ !threading_strategy.Reset(frame_header, settings_.threads)) {
+ return kStatusOutOfMemory;
+ }
const bool do_cdef =
PostFilter::DoCdef(frame_header, settings_.post_filter_mask);
const int num_planes = sequence_header.color_config.is_monochrome
@@ -1198,15 +1214,11 @@
const bool do_superres =
PostFilter::DoSuperRes(frame_header, settings_.post_filter_mask);
// Use kBorderPixels for the left, right, and top borders. Only the bottom
- // border may need to be bigger. SuperRes border is needed only if we are
- // applying SuperRes in-place which is being done only in single threaded
- // mode.
+ // border may need to be bigger. Cdef border is needed only if we apply Cdef
+ // without multithreading.
const int bottom_border = GetBottomBorderPixels(
- do_cdef, do_restoration,
- do_superres &&
- frame_scratch_buffer->threading_strategy.post_filter_thread_pool() ==
- nullptr,
- sequence_header.color_config.subsampling_y);
+ do_cdef && threading_strategy.post_filter_thread_pool() == nullptr,
+ do_restoration, do_superres, sequence_header.color_config.subsampling_y);
current_frame->set_chroma_sample_position(
sequence_header.color_config.chroma_sample_position);
if (!current_frame->Realloc(sequence_header.color_config.bitdepth,
@@ -1269,8 +1281,7 @@
// without having to check for boundary conditions.
if (!frame_scratch_buffer->block_parameters_holder.Reset(
frame_header.rows4x4 + kMaxBlockHeight4x4,
- frame_header.columns4x4 + kMaxBlockWidth4x4,
- sequence_header.use_128x128_superblock)) {
+ frame_header.columns4x4 + kMaxBlockWidth4x4)) {
return kStatusOutOfMemory;
}
const dsp::Dsp* const dsp =
@@ -1288,12 +1299,6 @@
LIBGAV1_DLOG(ERROR, "tiles.reserve(%d) failed.\n", tile_count);
return kStatusOutOfMemory;
}
- ThreadingStrategy& threading_strategy =
- frame_scratch_buffer->threading_strategy;
- if (!is_frame_parallel_ &&
- !threading_strategy.Reset(frame_header, settings_.threads)) {
- return kStatusOutOfMemory;
- }
if (threading_strategy.row_thread_pool(0) != nullptr || is_frame_parallel_) {
if (frame_scratch_buffer->residual_buffer_pool == nullptr) {
@@ -1318,43 +1323,36 @@
}
}
- if (threading_strategy.post_filter_thread_pool() != nullptr &&
- (do_cdef || do_restoration)) {
- const int window_buffer_width = PostFilter::GetWindowBufferWidth(
- threading_strategy.post_filter_thread_pool(), frame_header);
- size_t threaded_window_buffer_size =
- window_buffer_width *
- PostFilter::GetWindowBufferHeight(
- threading_strategy.post_filter_thread_pool(), frame_header) *
- (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
- : sizeof(uint16_t));
- if (do_cdef) {
- // TODO(chengchen): for cdef U, V planes, if there's subsampling, we can
- // use smaller buffer.
- threaded_window_buffer_size *= num_planes;
- }
- // To avoid false sharing, PostFilter's window width in bytes should be a
- // multiple of the cache line size. For simplicity, we check the window
- // width in pixels.
- assert(window_buffer_width % kCacheLineSize == 0);
- if (!frame_scratch_buffer->threaded_window_buffer.Resize(
- threaded_window_buffer_size)) {
- LIBGAV1_DLOG(ERROR,
- "Failed to resize threaded loop restoration buffer.\n");
+ if (threading_strategy.post_filter_thread_pool() != nullptr && do_cdef) {
+ // We need to store 4 rows per 64x64 unit.
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_units| rows of the loop
+ // restoration border pixels.
+ if (!frame_scratch_buffer->cdef_border.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_units,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
+ kBorderPixels, nullptr, nullptr, nullptr)) {
return kStatusOutOfMemory;
}
}
- if (do_cdef && do_restoration) {
+ if (do_restoration &&
+ (do_cdef || threading_strategy.post_filter_thread_pool() != nullptr)) {
// We need to store 4 rows per 64x64 unit.
- const int num_deblock_units = MultiplyBy4(Ceil(frame_header.rows4x4, 16));
+ const int num_units =
+ MultiplyBy4(RightShiftWithCeiling(frame_header.rows4x4, 4));
// subsampling_y is set to zero irrespective of the actual frame's
- // subsampling since we need to store exactly |num_deblock_units| rows of
- // the deblocked pixels.
- if (!frame_scratch_buffer->deblock_buffer.Realloc(
+ // subsampling since we need to store exactly |num_units| rows of the loop
+ // restoration border pixels.
+ if (!frame_scratch_buffer->loop_restoration_border.Realloc(
sequence_header.color_config.bitdepth,
sequence_header.color_config.is_monochrome,
- frame_header.upscaled_width, num_deblock_units,
+ frame_header.upscaled_width, num_units,
sequence_header.color_config.subsampling_x,
/*subsampling_y=*/0, kBorderPixels, kBorderPixels, kBorderPixels,
kBorderPixels, nullptr, nullptr, nullptr)) {
@@ -1363,18 +1361,45 @@
}
if (do_superres) {
+ const int pixel_size = sequence_header.color_config.bitdepth == 8
+ ? sizeof(uint8_t)
+ : sizeof(uint16_t);
+ if (!frame_scratch_buffer->superres_coefficients[kPlaneTypeY].Resize(
+ kSuperResFilterTaps * Align(frame_header.upscaled_width, 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeY].");
+ return kStatusOutOfMemory;
+ }
+ if (!sequence_header.color_config.is_monochrome &&
+ sequence_header.color_config.subsampling_x != 0 &&
+ !frame_scratch_buffer->superres_coefficients[kPlaneTypeUV].Resize(
+ kSuperResFilterTaps *
+ Align(SubsampledValue(frame_header.upscaled_width, 1), 16) *
+ pixel_size)) {
+ LIBGAV1_DLOG(ERROR,
+ "Failed to Resize superres_coefficients[kPlaneTypeUV].");
+ return kStatusOutOfMemory;
+ }
+ }
+
+ if (do_superres && threading_strategy.post_filter_thread_pool() != nullptr) {
const int num_threads =
- 1 + ((threading_strategy.post_filter_thread_pool() == nullptr)
- ? 0
- : threading_strategy.post_filter_thread_pool()->num_threads());
- const size_t superres_line_buffer_size =
- num_threads *
- (MultiplyBy4(frame_header.columns4x4) +
- MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
- (sequence_header.color_config.bitdepth == 8 ? sizeof(uint8_t)
- : sizeof(uint16_t));
- if (!frame_scratch_buffer->superres_line_buffer.Resize(
- superres_line_buffer_size)) {
+ threading_strategy.post_filter_thread_pool()->num_threads() + 1;
+ // subsampling_y is set to zero irrespective of the actual frame's
+ // subsampling since we need to store exactly |num_threads| rows of the
+ // down-scaled pixels.
+ // Left and right borders are for line extension. They are doubled for the Y
+ // plane to make sure the U and V planes have enough space after possible
+ // subsampling.
+ if (!frame_scratch_buffer->superres_line_buffer.Realloc(
+ sequence_header.color_config.bitdepth,
+ sequence_header.color_config.is_monochrome,
+ MultiplyBy4(frame_header.columns4x4), num_threads,
+ sequence_header.color_config.subsampling_x,
+ /*subsampling_y=*/0, 2 * kSuperResHorizontalBorder,
+ 2 * (kSuperResHorizontalBorder + kSuperResHorizontalPadding), 0, 0,
+ nullptr, nullptr, nullptr)) {
LIBGAV1_DLOG(ERROR, "Failed to resize superres line buffer.\n");
return kStatusOutOfMemory;
}
@@ -1384,14 +1409,11 @@
current_frame->buffer(), dsp,
settings_.post_filter_mask);
- if (is_frame_parallel_) {
+ if (is_frame_parallel_ && !IsIntraFrame(frame_header.frame_type)) {
// We can parse the current frame if all the reference frames have been
// parsed.
- for (int i = 0; i < kNumReferenceFrameTypes; ++i) {
- if (!state.reference_valid[i] || state.reference_frame[i] == nullptr) {
- continue;
- }
- if (!state.reference_frame[i]->WaitUntilParsed()) {
+ for (const int index : frame_header.reference_frame_index) {
+ if (!state.reference_frame[index]->WaitUntilParsed()) {
return kStatusUnknownError;
}
}
@@ -1434,7 +1456,7 @@
}
IntraPredictionBuffer* const intra_prediction_buffers =
frame_scratch_buffer->intra_prediction_buffers.get();
- for (int plane = 0; plane < num_planes; ++plane) {
+ for (int plane = kPlaneY; plane < num_planes; ++plane) {
const int subsampling =
(plane == kPlaneY) ? 0 : sequence_header.color_config.subsampling_x;
const size_t intra_prediction_buffer_size =
@@ -1462,9 +1484,9 @@
tile_number, tile_buffers[tile_number].data,
tile_buffers[tile_number].size, sequence_header, frame_header,
current_frame, state, frame_scratch_buffer, wedge_masks_,
- &saved_symbol_decoder_context, prev_segment_ids, &post_filter, dsp,
- threading_strategy.row_thread_pool(tile_number), &pending_tiles,
- is_frame_parallel_, use_intra_prediction_buffer);
+ quantizer_matrix_, &saved_symbol_decoder_context, prev_segment_ids,
+ &post_filter, dsp, threading_strategy.row_thread_pool(tile_number),
+ &pending_tiles, is_frame_parallel_, use_intra_prediction_buffer);
if (tile == nullptr) {
LIBGAV1_DLOG(ERROR, "Failed to create tile.");
return kStatusOutOfMemory;
@@ -1626,4 +1648,27 @@
return sequence_header_changed;
}
+bool DecoderImpl::MaybeInitializeWedgeMasks(FrameType frame_type) {
+ if (IsIntraFrame(frame_type) || wedge_masks_initialized_) {
+ return true;
+ }
+ if (!GenerateWedgeMask(&wedge_masks_)) {
+ return false;
+ }
+ wedge_masks_initialized_ = true;
+ return true;
+}
+
+bool DecoderImpl::MaybeInitializeQuantizerMatrix(
+ const ObuFrameHeader& frame_header) {
+ if (quantizer_matrix_initialized_ || !frame_header.quantizer.use_matrix) {
+ return true;
+ }
+ if (!InitializeQuantizerMatrix(&quantizer_matrix_)) {
+ return false;
+ }
+ quantizer_matrix_initialized_ = true;
+ return true;
+}
+
} // namespace libgav1
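A quick check of the num_units arithmetic introduced above: RightShiftWithCeiling(frame_header.rows4x4, 4) is ceil(rows4x4 / 16), the number of 64x64 unit rows in the frame, and MultiplyBy4() accounts for the 4 border rows stored per unit. For a 720-pixel-tall frame, rows4x4 = 180, giving ceil(180 / 16) = 12 unit rows and 48 stored rows in each of cdef_border and loop_restoration_border.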
diff --git a/libgav1/src/decoder_impl.h b/libgav1/src/decoder_impl.h
index df1b091..b52ecdf 100644
--- a/libgav1/src/decoder_impl.h
+++ b/libgav1/src/decoder_impl.h
@@ -32,6 +32,7 @@
#include "src/gav1/decoder_settings.h"
#include "src/gav1/status_code.h"
#include "src/obu_parser.h"
+#include "src/quantizer.h"
#include "src/residual_buffer_pool.h"
#include "src/symbol_decoder_context.h"
#include "src/tile.h"
@@ -57,7 +58,7 @@
temporal_unit(nullptr),
frame(frame),
position_in_temporal_unit(position_in_temporal_unit) {
- obu->MoveTileBuffer(&tile_buffers);
+ obu->MoveTileBuffers(&tile_buffers);
frame->MarkFrameAsStarted();
}
@@ -210,6 +211,14 @@
return failure_status_ != kStatusOk;
}
+ // Initializes the |quantizer_matrix_| if necessary and sets
+ // |quantizer_matrix_initialized_| to true.
+ bool MaybeInitializeQuantizerMatrix(const ObuFrameHeader& frame_header);
+
+ // Allocates and generates the |wedge_masks_| if necessary and sets
+ // |wedge_masks_initialized_| to true.
+ bool MaybeInitializeWedgeMasks(FrameType frame_type);
+
// Elements in this queue cannot be moved with std::move since the
// |EncodedFrame.temporal_unit| stores a pointer to elements in this queue.
Queue<TemporalUnit> temporal_units_;
@@ -228,6 +237,9 @@
BufferPool buffer_pool_;
WedgeMaskArray wedge_masks_;
+ bool wedge_masks_initialized_ = false;
+ QuantizerMatrix quantizer_matrix_;
+ bool quantizer_matrix_initialized_ = false;
FrameScratchBufferPool frame_scratch_buffer_pool_;
// Used to synchronize the accesses into |temporal_units_| in order to update
diff --git a/libgav1/src/decoder_state.h b/libgav1/src/decoder_state.h
index 897c99f..ea5c792 100644
--- a/libgav1/src/decoder_state.h
+++ b/libgav1/src/decoder_state.h
@@ -33,7 +33,6 @@
for (int ref_index = 0, mask = refresh_frame_flags; mask != 0;
++ref_index, mask >>= 1) {
if ((mask & 1) != 0) {
- reference_valid[ref_index] = true;
reference_frame_id[ref_index] = current_frame_id;
reference_frame[ref_index] = current_frame;
reference_order_hint[ref_index] = order_hint;
@@ -43,7 +42,6 @@
// Clears all the reference frames.
void ClearReferenceFrames() {
- reference_valid = {};
reference_frame_id = {};
reference_order_hint = {};
for (int ref_index = 0; ref_index < kNumReferenceFrameTypes; ++ref_index) {
@@ -51,12 +49,11 @@
}
}
- // reference_valid and reference_frame_id are used only if
- // sequence_header_.frame_id_numbers_present is true.
- // The reference_valid array is indexed by a reference picture slot number.
- // A value (boolean) in the array signifies whether the corresponding
- // reference picture slot is valid for use as a reference picture.
- std::array<bool, kNumReferenceFrameTypes> reference_valid = {};
+ // reference_frame_id and current_frame_id have meaningful values and are used
+ // in checks only if sequence_header_.frame_id_numbers_present is true. If
+ // sequence_header_.frame_id_numbers_present is false, reference_frame_id and
+ // current_frame_id are assigned the default value 0 and are not used in
+ // checks.
std::array<uint16_t, kNumReferenceFrameTypes> reference_frame_id = {};
// A valid value of current_frame_id is an unsigned integer of at most 16
// bits. -1 indicates current_frame_id is not initialized.
@@ -81,6 +78,11 @@
// * |true| indicates that the reference frame is a backwards reference.
// Note: reference_frame_sign_bias[0] (for kReferenceFrameIntra) is not used.
std::array<bool, kNumReferenceFrameTypes> reference_frame_sign_bias = {};
+ // The RefValid[i] variable in the spec does not need to be stored explicitly.
+ // If the RefValid[i] variable in the spec is 0, then reference_frame[i] is a
+ // null pointer. (Whenever the spec sets the RefValid[i] variable to 0, we set
+ // reference_frame[i] to a null pointer.) If the RefValid[i] variable in the
+ // spec is 1, then reference_frame[i] contains a frame buffer pointer.
std::array<RefCountedBufferPtr, kNumReferenceFrameTypes> reference_frame;
};
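The comment above folds the spec's RefValid[i] array into the nullness of reference_frame[i]. A hypothetical helper, not part of the library, stating that invariant directly:

```cpp
#include "src/decoder_state.h"  // Path as in this tree.

// Hypothetical illustration only: RefValid[i] == 1 in the spec exactly when
// the reference slot holds a frame buffer pointer.
inline bool IsReferenceValid(const libgav1::DecoderState& state,
                             int ref_index) {
  return state.reference_frame[ref_index] != nullptr;
}
```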
diff --git a/libgav1/src/dsp/arm/average_blend_neon.cc b/libgav1/src/dsp/arm/average_blend_neon.cc
index d946d70..5b4c094 100644
--- a/libgav1/src/dsp/arm/average_blend_neon.cc
+++ b/libgav1/src/dsp/arm/average_blend_neon.cc
@@ -35,6 +35,11 @@
constexpr int kInterPostRoundBit =
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
inline uint8x8_t AverageBlend8Row(const int16_t* prediction_0,
const int16_t* prediction_1) {
const int16x8_t pred0 = vld1q_s16(prediction_0);
@@ -46,19 +51,24 @@
inline void AverageBlendLargeRow(const int16_t* prediction_0,
const int16_t* prediction_1, const int width,
uint8_t* dest) {
- int x = 0;
+ int x = width;
do {
- const int16x8_t pred_00 = vld1q_s16(&prediction_0[x]);
- const int16x8_t pred_01 = vld1q_s16(&prediction_1[x]);
+ const int16x8_t pred_00 = vld1q_s16(prediction_0);
+ const int16x8_t pred_01 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
const int16x8_t res0 = vaddq_s16(pred_00, pred_01);
const uint8x8_t res_out0 = vqrshrun_n_s16(res0, kInterPostRoundBit + 1);
- const int16x8_t pred_10 = vld1q_s16(&prediction_0[x + 8]);
- const int16x8_t pred_11 = vld1q_s16(&prediction_1[x + 8]);
+ const int16x8_t pred_10 = vld1q_s16(prediction_0);
+ const int16x8_t pred_11 = vld1q_s16(prediction_1);
+ prediction_0 += 8;
+ prediction_1 += 8;
const int16x8_t res1 = vaddq_s16(pred_10, pred_11);
const uint8x8_t res_out1 = vqrshrun_n_s16(res1, kInterPostRoundBit + 1);
- vst1q_u8(dest + x, vcombine_u8(res_out0, res_out1));
- x += 16;
- } while (x < width);
+ vst1q_u8(dest, vcombine_u8(res_out0, res_out1));
+ dest += 16;
+ x -= 16;
+ } while (x != 0);
}
void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
@@ -123,13 +133,139 @@
}
} // namespace
+} // namespace low_bitdepth
-void AverageBlendInit_NEON() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x8_t AverageBlend8Row(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ const uint16x8_t pred0 = vld1q_u16(prediction_0);
+ const uint16x8_t pred1 = vld1q_u16(prediction_1);
+ const uint32x4_t pred_lo =
+ vaddl_u16(vget_low_u16(pred0), vget_low_u16(pred1));
+ const uint32x4_t pred_hi =
+ vaddl_u16(vget_high_u16(pred0), vget_high_u16(pred1));
+ const int32x4_t offset_lo =
+ vsubq_s32(vreinterpretq_s32_u32(pred_lo), compound_offset);
+ const int32x4_t offset_hi =
+ vsubq_s32(vreinterpretq_s32_u32(pred_hi), compound_offset);
+ const uint16x4_t res_lo = vqrshrun_n_s32(offset_lo, kInterPostRoundBit + 1);
+ const uint16x4_t res_hi = vqrshrun_n_s32(offset_hi, kInterPostRoundBit + 1);
+ return vminq_u16(vcombine_u16(res_lo, res_hi), v_bitdepth);
+}
+
+inline void AverageBlendLargeRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, const int width,
+ uint16_t* dest,
+ const int32x4_t compound_offset,
+ const uint16x8_t v_bitdepth) {
+ int x = width;
+ do {
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ vst1q_u16(dest, AverageBlend8Row(prediction_0, prediction_1,
+ compound_offset, v_bitdepth));
+ prediction_0 += 8;
+ prediction_1 += 8;
+ dest += 8;
+
+ x -= 16;
+ } while (x != 0);
+}
+
+void AverageBlend_NEON(const void* prediction_0, const void* prediction_1,
+ const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = height;
+
+ const ptrdiff_t dst_stride = dest_stride >> 1;
+ const int32x4_t compound_offset =
+ vdupq_n_s32(static_cast<int32_t>(kCompoundOffset + kCompoundOffset));
+ const uint16x8_t v_bitdepth = vdupq_n_u16((1 << kBitdepth10) - 1);
+ if (width == 4) {
+ do {
+ const uint16x8_t result =
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth);
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1_u16(dst, vget_low_u16(result));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(result));
+ dst += dst_stride;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ if (width == 8) {
+ do {
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ vst1q_u16(dst,
+ AverageBlend8Row(pred_0, pred_1, compound_offset, v_bitdepth));
+ dst += dst_stride;
+ pred_0 += 8;
+ pred_1 += 8;
+
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+
+ do {
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ AverageBlendLargeRow(pred_0, pred_1, width, dst, compound_offset,
+ v_bitdepth);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+
+ y -= 2;
+ } while (y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->average_blend = AverageBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
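For readers of the intrinsics, a scalar model of what one lane of the new 10-bit AverageBlend8Row() computes (a sketch for reference, not library code). The two constants below are stand-ins so the sketch compiles on its own; kInterPostRoundBit is defined at the top of this file from kInterRoundBitsVertical - kInterRoundBitsCompoundVertical, and the real values live in src/utils/constants.h:

```cpp
#include <algorithm>
#include <cstdint>

// Stand-in values, assumed for illustration only; do not treat them as
// authoritative.
constexpr int kInterPostRoundBit = 4;
constexpr int kCompoundOffset = 1 << 14;

// One lane of AverageBlend8Row(): widen and add (vaddl_u16), subtract the
// doubled compound offset, rounding right shift with unsigned saturation
// (vqrshrun_n_s32), then clamp to (1 << kBitdepth10) - 1 (vminq_u16).
inline uint16_t AverageBlendPixel10bpp(uint16_t pred_0, uint16_t pred_1) {
  const int diff = pred_0 + pred_1 - 2 * kCompoundOffset;
  const int rounded =
      (diff + (1 << kInterPostRoundBit)) >> (kInterPostRoundBit + 1);
  return static_cast<uint16_t>(std::clamp(rounded, 0, (1 << 10) - 1));
}
```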
diff --git a/libgav1/src/dsp/arm/cdef_neon.cc b/libgav1/src/dsp/arm/cdef_neon.cc
index 968b0ff..60c72d6 100644
--- a/libgav1/src/dsp/arm/cdef_neon.cc
+++ b/libgav1/src/dsp/arm/cdef_neon.cc
@@ -265,7 +265,7 @@
// 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
// 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
// 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
- partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), partial_lo[2], 0);
+ partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
@@ -285,9 +285,8 @@
// 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
// 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
// 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
- const uint8x8_t v_zero = vdup_n_u8(0);
- partial_lo[6] = vaddl_u8(v_zero, v_src[0]);
- for (int i = 1; i < 8; ++i) {
+ partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
+ for (int i = 2; i < 8; ++i) {
partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
}
@@ -360,7 +359,7 @@
}
void CdefDirection_NEON(const void* const source, ptrdiff_t stride,
- int* const direction, int* const variance) {
+ uint8_t* const direction, int* const variance) {
assert(direction != nullptr);
assert(variance != nullptr);
const auto* src = static_cast<const uint8_t*>(source);
@@ -451,7 +450,7 @@
int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
const uint16x8_t threshold, const int16x8_t damping) {
- // If reference > pixel, the difference will be negative, so covert to 0 or
+ // If reference > pixel, the difference will be negative, so convert to 0 or
// -1.
const uint16x8_t sign = vcgtq_u16(reference, pixel);
const uint16x8_t abs_diff = vabdq_u16(pixel, reference);
@@ -686,7 +685,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/common_neon.h b/libgav1/src/dsp/arm/common_neon.h
index e8367ab..05e0d05 100644
--- a/libgav1/src/dsp/arm/common_neon.h
+++ b/libgav1/src/dsp/arm/common_neon.h
@@ -28,8 +28,7 @@
#if 0
#include <cstdio>
-
-#include "absl/strings/str_cat.h"
+#include <string>
constexpr bool kEnablePrintRegs = true;
@@ -86,11 +85,11 @@
inline void PrintReg(const int32x4x2_t val, const std::string& name) {
DebugRegisterQ r;
- vst1q_u32(r.u32, val.val[0]);
- const std::string name0 = absl::StrCat(name, ".val[0]").c_str();
+ vst1q_s32(r.i32, val.val[0]);
+ const std::string name0 = name + std::string(".val[0]");
PrintVectQ(r, name0.c_str(), 32);
- vst1q_u32(r.u32, val.val[1]);
- const std::string name1 = absl::StrCat(name, ".val[1]").c_str();
+ vst1q_s32(r.i32, val.val[1]);
+ const std::string name1 = name + std::string(".val[1]");
PrintVectQ(r, name1.c_str(), 32);
}
@@ -169,14 +168,14 @@
// Print an individual (non-vector) value in decimal format.
inline void PrintReg(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %d\n", name, x);
+ fprintf(stderr, "%s: %d\n", name, x);
}
}
// Print an individual (non-vector) value in hexadecimal format.
inline void PrintHex(const int x, const char* name) {
if (kEnablePrintRegs) {
- printf("%s: %x\n", name, x);
+ fprintf(stderr, "%s: %x\n", name, x);
}
}
@@ -277,22 +276,32 @@
ValueToMem<uint32_t>(buf, vget_lane_u32(vreinterpret_u32_u16(val), lane));
}
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store4(void* const buf, const uint16x4_t val) {
+ vst1_u16(static_cast<uint16_t*>(buf), val);
+}
+
+// Simplify code when caller has |buf| cast as uint8_t*.
+inline void Store8(void* const buf, const uint16x8_t val) {
+ vst1q_u16(static_cast<uint16_t*>(buf), val);
+}
+
//------------------------------------------------------------------------------
// Bit manipulation.
// vshXX_n_XX() requires an immediate.
template <int shift>
-inline uint8x8_t LeftShift(const uint8x8_t vector) {
+inline uint8x8_t LeftShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline uint8x8_t RightShift(const uint8x8_t vector) {
+inline uint8x8_t RightShiftVector(const uint8x8_t vector) {
return vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(vector), shift));
}
template <int shift>
-inline int8x8_t RightShift(const int8x8_t vector) {
+inline int8x8_t RightShiftVector(const int8x8_t vector) {
return vreinterpret_s8_u64(vshr_n_u64(vreinterpret_u64_s8(vector), shift));
}
@@ -387,6 +396,15 @@
#endif // defined(__aarch64__)
}
+inline uint32_t SumVector(const uint32x2_t a) {
+#if defined(__aarch64__)
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif // defined(__aarch64__)
+}
+
inline uint32_t SumVector(const uint32x4_t a) {
#if defined(__aarch64__)
return vaddvq_u32(a);
@@ -447,6 +465,36 @@
}
// Input:
+// 00 01 02 03
+// 10 11 12 13
+// 20 21 22 23
+// 30 31 32 33
+inline void Transpose4x4(uint16x4_t a[4]) {
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+// Input:
// a: 00 01 02 03 10 11 12 13
// b: 20 21 22 23 30 31 32 33
// Output:
@@ -587,6 +635,28 @@
a[7] = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
+inline void Transpose8x8(uint8x8_t in[8], uint8x16_t out[4]) {
+ const uint8x16x2_t a0 =
+ vtrnq_u8(vcombine_u8(in[0], in[4]), vcombine_u8(in[1], in[5]));
+ const uint8x16x2_t a1 =
+ vtrnq_u8(vcombine_u8(in[2], in[6]), vcombine_u8(in[3], in[7]));
+
+ const uint16x8x2_t b0 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[0]),
+ vreinterpretq_u16_u8(a1.val[0]));
+ const uint16x8x2_t b1 = vtrnq_u16(vreinterpretq_u16_u8(a0.val[1]),
+ vreinterpretq_u16_u8(a1.val[1]));
+
+ const uint32x4x2_t c0 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vuzpq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ out[0] = vreinterpretq_u8_u32(c0.val[0]);
+ out[1] = vreinterpretq_u8_u32(c1.val[0]);
+ out[2] = vreinterpretq_u8_u32(c0.val[1]);
+ out[3] = vreinterpretq_u8_u32(c1.val[1]);
+}
+
// Input:
// a[0]: 00 01 02 03 04 05 06 07
// a[1]: 10 11 12 13 14 15 16 17
@@ -667,6 +737,83 @@
a[7] = d3.val[1];
}
+// Input:
+// a[0]: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
+// a[1]: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
+// a[2]: 20 21 22 23 24 25 26 27 a0 a1 a2 a3 a4 a5 a6 a7
+// a[3]: 30 31 32 33 34 35 36 37 b0 b1 b2 b3 b4 b5 b6 b7
+// a[4]: 40 41 42 43 44 45 46 47 c0 c1 c2 c3 c4 c5 c6 c7
+// a[5]: 50 51 52 53 54 55 56 57 d0 d1 d2 d3 d4 d5 d6 d7
+// a[6]: 60 61 62 63 64 65 66 67 e0 e1 e2 e3 e4 e5 e6 e7
+// a[7]: 70 71 72 73 74 75 76 77 f0 f1 f2 f3 f4 f5 f6 f7
+
+// Output:
+// a[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+// a[1]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+// a[2]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+// a[3]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+// a[4]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+// a[5]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+// a[6]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+// a[7]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+inline void Transpose8x16(uint8x16_t a[8]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
+ // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
+ // b1.val[0]: 20 30 22 32 24 34 26 36 a0 b0 a2 b2 a4 b4 a6 b6
+ // b1.val[1]: 21 31 23 33 25 35 27 37 a1 b1 a3 b3 a5 b5 a7 b7
+ // b2.val[0]: 40 50 42 52 44 54 46 56 c0 d0 c2 d2 c4 d4 c6 d6
+ // b2.val[1]: 41 51 43 53 45 55 47 57 c1 d1 c3 d3 c5 d5 c7 d7
+ // b3.val[0]: 60 70 62 72 64 74 66 76 e0 f0 e2 f2 e4 f4 e6 f6
+ // b3.val[1]: 61 71 63 73 65 75 67 77 e1 f1 e3 f3 e5 f5 e7 f7
+ const uint8x16x2_t b0 = vtrnq_u8(a[0], a[1]);
+ const uint8x16x2_t b1 = vtrnq_u8(a[2], a[3]);
+ const uint8x16x2_t b2 = vtrnq_u8(a[4], a[5]);
+ const uint8x16x2_t b3 = vtrnq_u8(a[6], a[7]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 a0 b0 84 94 a4 b4
+ // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 a2 b2 86 96 a6 b6
+ // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 a1 b1 85 95 a5 b5
+ // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 a3 b3 87 97 a7 b7
+ // c2.val[0]: 40 50 60 70 44 54 64 74 c0 d0 e0 f0 c4 d4 e4 f4
+ // c2.val[1]: 42 52 62 72 46 56 66 76 c2 d2 e2 f2 c6 d6 e6 f6
+ // c3.val[0]: 41 51 61 71 45 55 65 75 c1 d1 e1 f1 c5 d5 e5 f5
+ // c3.val[1]: 43 53 63 73 47 57 67 77 c3 d3 e3 f3 c7 d7 e7 f7
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+ const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
+ vreinterpretq_u16_u8(b3.val[0]));
+ const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
+ vreinterpretq_u16_u8(b3.val[1]));
+
+ // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0
+ // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 a4 b4 c4 d4 e4 f4
+ // d1.val[0]: 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1
+ // d1.val[1]: 05 15 25 35 45 55 65 75 85 95 a5 b5 c5 d5 e5 f5
+ // d2.val[0]: 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2
+ // d2.val[1]: 06 16 26 36 46 56 66 76 86 96 a6 b6 c6 d6 e6 f6
+ // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3
+ // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 a7 b7 c7 d7 e7 f7
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c2.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
+ vreinterpretq_u32_u16(c3.val[0]));
+ const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c2.val[1]));
+ const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
+ vreinterpretq_u32_u16(c3.val[1]));
+
+ a[0] = vreinterpretq_u8_u32(d0.val[0]);
+ a[1] = vreinterpretq_u8_u32(d1.val[0]);
+ a[2] = vreinterpretq_u8_u32(d2.val[0]);
+ a[3] = vreinterpretq_u8_u32(d3.val[0]);
+ a[4] = vreinterpretq_u8_u32(d0.val[1]);
+ a[5] = vreinterpretq_u8_u32(d1.val[1]);
+ a[6] = vreinterpretq_u8_u32(d2.val[1]);
+ a[7] = vreinterpretq_u8_u32(d3.val[1]);
+}
+
inline int16x8_t ZeroExtend(const uint8x8_t in) {
return vreinterpretq_s16_u16(vmovl_u8(in));
}
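As a reading aid for the lane diagrams above, a scalar statement of what the 4x4 uint16 transpose computes (a sketch, not library code); the NEON version reaches the same result in-register via the vtrn_u16/vtrn_u32 ladder instead of element stores:

```cpp
#include <cstdint>

// Scalar equivalent of the uint16x4_t Transpose4x4() added above:
// out[i][j] = in[j][i] for the 4x4 block.
void Transpose4x4Scalar(const uint16_t in[4][4], uint16_t out[4][4]) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][i];
    }
  }
}
```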
diff --git a/libgav1/src/dsp/arm/convolve_neon.cc b/libgav1/src/dsp/arm/convolve_neon.cc
index 2c2557f..331bfe2 100644
--- a/libgav1/src/dsp/arm/convolve_neon.cc
+++ b/libgav1/src/dsp/arm/convolve_neon.cc
@@ -35,9 +35,8 @@
namespace low_bitdepth {
namespace {
-constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kHorizontalOffset = 3;
-constexpr int kFilterIndexShift = 6;
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/convolve.inc"
// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
@@ -102,245 +101,278 @@
return vreinterpretq_s16_u16(sum);
}
-template <int filter_index, bool negative_outside_taps>
-int16x8_t SumHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- uint8x8_t v_src[8];
- const uint8x16_t src_long = vld1q_u8(src);
- int16x8_t sum;
-
- if (filter_index < 2) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 1);
- } else if (filter_index == 2) {
- v_src[0] = vget_low_u8(src_long);
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
- v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap);
- } else if (filter_index == 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 3);
- } else if (filter_index > 3) {
- v_src[0] = vget_low_u8(vextq_u8(src_long, src_long, 2));
- v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 3));
- v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 4));
- v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 5));
- sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src, v_tap + 2);
- }
- return sum;
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint8x8_t SimpleHorizontalTaps(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index, bool negative_outside_taps>
-uint16x8_t HorizontalTaps8To16(const uint8_t* const src,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps<filter_index, negative_outside_taps>(src, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int filter_index>
-int16x8_t SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- uint16x8_t sum;
- const uint8x8_t input0 = vld1_u8(src);
- src += src_stride;
- const uint8x8_t input1 = vld1_u8(src);
- uint8x8x2_t input = vzip_u8(input0, input1);
-
- if (filter_index == 3) {
- // tap signs : + +
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- } else if (filter_index == 4) {
- // tap signs : - + + -
- sum = vmull_u8(vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- } else {
- // tap signs : + + + +
- sum = vmull_u8(RightShift<4 * 8>(input.val[0]), v_tap[2]);
- sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[3]);
- sum = vmlal_u8(sum, input.val[1], v_tap[4]);
- sum = vmlal_u8(sum, RightShift<2 * 8>(input.val[1]), v_tap[5]);
- }
-
- return vreinterpretq_s16_u16(sum);
-}
-
-template <int filter_index>
-uint8x8_t SimpleHorizontalTaps2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- int16x8_t sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
- return vqrshrun_n_s16(sum, kFilterBits - 1);
-}
-
-template <int filter_index>
-uint16x8_t HorizontalTaps8To16_2x2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const uint8x8_t* const v_tap) {
- const int16x8_t sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return vreinterpretq_u16_s16(
- vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
-}
-
-template <int num_taps, int step, int filter_index,
- bool negative_outside_taps = true, bool is_2d = false,
- bool is_compound = false>
-void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dest, const ptrdiff_t pred_stride,
- const int width, const int height,
- const uint8x8_t* const v_tap) {
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontalWidth8AndUp(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
auto* dest8 = static_cast<uint8_t*>(dest);
auto* dest16 = static_cast<uint16_t*>(dest);
-
- // 4 tap filters are never used when width > 4.
- if (num_taps != 4 && width > 4) {
- int y = 0;
+ if (!is_2d) {
+ int y = height;
do {
int x = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ do { // Increasing loop counter x is better.
+ const uint8x16_t src_long = vld1q_u8(src + x);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_compound) {
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
vst1q_u16(&dest16[x], v_sum);
} else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(&src[x],
- v_tap);
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
vst1_u8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
+ } else {
+ int x = 0;
+ do {
+ const uint8_t* s = src + x;
+ int y = height;
+ do { // The outer loop still increments x; this inner loop walks the rows.
+ const uint8x16_t src_long = vld1q_u8(s);
+ uint8x8_t v_src[8];
+ int16x8_t sum;
+ if (filter_index < 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ sum = SumOnePassTaps<filter_index, negative_outside_taps>(v_src,
+ v_tap + 1);
+ } else if (filter_index == 2) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ v_src[4] = vget_low_u8(vextq_u8(src_long, src_long, 4));
+ v_src[5] = vget_low_u8(vextq_u8(src_long, src_long, 5));
+ v_src[6] = vget_low_u8(vextq_u8(src_long, src_long, 6));
+ v_src[7] = vget_low_u8(vextq_u8(src_long, src_long, 7));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap);
+ } else if (filter_index == 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else if (filter_index > 3) {
+ v_src[0] = vget_low_u8(src_long);
+ v_src[1] = vget_low_u8(vextq_u8(src_long, src_long, 1));
+ v_src[2] = vget_low_u8(vextq_u8(src_long, src_long, 2));
+ v_src[3] = vget_low_u8(vextq_u8(src_long, src_long, 3));
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ const uint16x8_t v_sum = vreinterpretq_u16_s16(
+ vrshrq_n_s16(sum, kInterRoundBitsHorizontal - 1));
+ vst1q_u16(dest16, v_sum);
+ s += src_stride;
+ dest16 += 8;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+ }
+}
+
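The comment repeated through these horizontal filters ("Normally the Horizontal pass does the downshift in two passes...") rests on an integer identity: a rounding shift by b1 followed by a rounding shift by b2 equals a single rounding shift by b1 + b2 once the first shift's rounding offset is added up front. A self-contained check of that identity, assuming the usual 8-bit constants kFilterBits == 7 and kInterRoundBitsHorizontal == 3 (treated as assumptions here) and an arithmetic right shift:

#include <cassert>

// Rounding shift as computed by the NEON vrshr family (arithmetic shift).
int RoundShift(const int v, const int bits) {
  return (v + (1 << (bits - 1))) >> bits;
}

int main() {
  const int b1 = 3 - 1;  // kInterRoundBitsHorizontal - 1 (taps pre-shifted).
  const int b2 = 7 - 3;  // kFilterBits - kInterRoundBitsHorizontal.
  for (int sum = -32768; sum <= 32767; ++sum) {
    const int two_pass = RoundShift(RoundShift(sum, b1), b2);
    // One shift by b1 + b2 == kFilterBits - 1, with the skipped shift's
    // rounding offset 1 << (b1 - 1) == 1 << (kInterRoundBitsHorizontal - 2)
    // folded into the input, exactly as the code above does.
    const int one_pass = RoundShift(sum + (1 << (b1 - 1)), b1 + b2);
    assert(two_pass == one_pass);
  }
  return 0;
}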
+template <int filter_index, bool is_2d, bool is_compound>
+void FilterHorizontalWidth4(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ uint8x8_t v_src[4];
+ int16x8_t sum;
+ v_src[0] = vld1_u8(src);
+ if (filter_index == 3) {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 3);
+ } else {
+ v_src[1] = RightShiftVector<1 * 8>(v_src[0]);
+ v_src[2] = RightShiftVector<2 * 8>(v_src[0]);
+ v_src[3] = RightShiftVector<3 * 8>(v_src[0]);
+ sum = SumOnePassTaps<filter_index, false>(v_src, v_tap + 2);
+ }
+ if (is_2d || is_compound) {
+ const uint16x4_t v_sum = vreinterpret_u16_s16(
+ vrshr_n_s16(vget_low_s16(sum), kInterRoundBitsHorizontal - 1));
+ vst1_u16(dest16, v_sum);
+ } else {
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ sum = vaddq_s16(sum, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(sum, kFilterBits - 1);
+ StoreLo4(&dest8[0], result);
+ }
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+}
+
+template <int filter_index, bool is_2d>
+void FilterHorizontalWidth2(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int height, const uint8x8_t* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+ int y = height >> 1;
+ do {
+ const uint8x8_t input0 = vld1_u8(src);
+ const uint8x8_t input1 = vld1_u8(src + src_stride);
+ const uint8x8x2_t input = vzip_u8(input0, input1);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ // tap signs : + +
+ sum = vmull_u8(input.val[0], v_tap[3]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 2), v_tap[4]);
+ } else if (filter_index == 4) {
+ // tap signs : - + + -
+ sum = vmull_u8(RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlsl_u8(sum, input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlsl_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ } else {
+ // tap signs : + + + +
+ sum = vmull_u8(input.val[0], v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input.val[0]), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<4 * 8>(input.val[0]), v_tap[4]);
+ sum = vmlal_u8(sum, vext_u8(input.val[0], input.val[1], 6), v_tap[5]);
+ }
+ int16x8_t s = vreinterpretq_s16_u16(sum);
+ if (is_2d) {
+ const uint16x8_t v_sum =
+ vreinterpretq_u16_s16(vrshrq_n_s16(s, kInterRoundBitsHorizontal - 1));
+ dest16[0] = vgetq_lane_u16(v_sum, 0);
+ dest16[1] = vgetq_lane_u16(v_sum, 2);
+ dest16 += pred_stride;
+ dest16[0] = vgetq_lane_u16(v_sum, 1);
+ dest16[1] = vgetq_lane_u16(v_sum, 3);
+ dest16 += pred_stride;
+ } else {
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift.
+ // Combining them requires adding the rounding offset from the skipped
+ // shift.
+ constexpr int first_shift_rounding_bit =
+ 1 << (kInterRoundBitsHorizontal - 2);
+ s = vaddq_s16(s, vdupq_n_s16(first_shift_rounding_bit));
+ const uint8x8_t result = vqrshrun_n_s16(s, kFilterBits - 1);
+ dest8[0] = vget_lane_u8(result, 0);
+ dest8[1] = vget_lane_u8(result, 2);
+ dest8 += pred_stride;
+ dest8[0] = vget_lane_u8(result, 1);
+ dest8[1] = vget_lane_u8(result, 3);
+ dest8 += pred_stride;
+ }
+ src += src_stride << 1;
+ } while (--y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ const uint8x8_t input = vld1_u8(src);
+ uint16x8_t sum;
+ if (filter_index == 3) {
+ sum = vmull_u8(input, v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[4]);
+ } else if (filter_index == 4) {
+ sum = vmull_u8(RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlsl_u8(sum, input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlsl_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ } else {
+ assert(filter_index == 5);
+ sum = vmull_u8(input, v_tap[2]);
+ sum = vmlal_u8(sum, RightShiftVector<1 * 8>(input), v_tap[3]);
+ sum = vmlal_u8(sum, RightShiftVector<2 * 8>(input), v_tap[4]);
+ sum = vmlal_u8(sum, RightShiftVector<3 * 8>(input), v_tap[5]);
+ }
+ // |sum| contains an int16_t value.
+ sum = vreinterpretq_u16_s16(vrshrq_n_s16(vreinterpretq_s16_u16(sum),
+ kInterRoundBitsHorizontal - 1));
+ Store2<0>(dest16, sum);
+ }
+}
+
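FilterHorizontalWidth2 leans on vzip_u8: interleaving the two input rows lets one 8-lane multiply-accumulate filter both rows at once, with even lanes carrying row 0 and odd lanes row 1, which is why the stores above pick lanes 0/2 for the first row and 1/3 for the second. A scalar sketch of that lane mapping for the 2-tap case (names hypothetical):

#include <cstdint>

// Rows A and B zipped as z = {A0, B0, A1, B1, ...}. Applying the taps at
// offsets two lanes apart filters both rows at once; the results stay
// interleaved: out[0]/out[2] are row A, out[1]/out[3] are row B.
void TwoTapTwoRowsModel(const uint8_t* z, const int t0, const int t1,
                        int out[4]) {
  for (int lane = 0; lane < 4; ++lane) {
    out[lane] = t0 * z[lane] + t1 * z[lane + 2];  // +2 lanes == next pixel.
  }
}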
+template <int filter_index, bool negative_outside_taps, bool is_2d,
+ bool is_compound>
+void FilterHorizontal(const uint8_t* const src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const uint8x8_t* const v_tap) {
+ assert(width < 8 || filter_index <= 3);
+ // Don't simplify the redundant if conditions with the template parameters;
+ // the redundancy helps the compiler generate compact code.
+ if (width >= 8 && filter_index <= 3) {
+ FilterHorizontalWidth8AndUp<filter_index, negative_outside_taps, is_2d,
+ is_compound>(src, src_stride, dest, pred_stride,
+ width, height, v_tap);
return;
}
- // Horizontal passes only needs to account for |num_taps| 2 and 4 when
+ // Horizontal passes only need to account for tap counts 2 and 4 when
// |width| <= 4.
assert(width <= 4);
- assert(num_taps <= 4);
- if (num_taps <= 4) {
+ assert(filter_index >= 3 && filter_index <= 5);
+ if (filter_index >= 3 && filter_index <= 5) {
if (width == 4) {
- int y = 0;
- do {
- if (is_2d || is_compound) {
- const uint16x8_t v_sum =
- HorizontalTaps8To16<filter_index, negative_outside_taps>(src,
- v_tap);
- vst1_u16(dest16, vget_low_u16(v_sum));
- } else {
- const uint8x8_t result =
- SimpleHorizontalTaps<filter_index, negative_outside_taps>(src,
- v_tap);
- StoreLo4(&dest8[0], result);
- }
- src += src_stride;
- dest8 += pred_stride;
- dest16 += pred_stride;
- } while (++y < height);
+ FilterHorizontalWidth4<filter_index, is_2d, is_compound>(
+ src, src_stride, dest, pred_stride, height, v_tap);
return;
}
-
+ assert(width == 2);
if (!is_compound) {
- int y = 0;
- do {
- if (is_2d) {
- const uint16x8_t sum =
- HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
- dest16[0] = vgetq_lane_u16(sum, 0);
- dest16[1] = vgetq_lane_u16(sum, 2);
- dest16 += pred_stride;
- dest16[0] = vgetq_lane_u16(sum, 1);
- dest16[1] = vgetq_lane_u16(sum, 3);
- dest16 += pred_stride;
- } else {
- const uint8x8_t sum =
- SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- dest8[0] = vget_lane_u8(sum, 0);
- dest8[1] = vget_lane_u8(sum, 2);
- dest8 += pred_stride;
-
- dest8[0] = vget_lane_u8(sum, 1);
- dest8[1] = vget_lane_u8(sum, 3);
- dest8 += pred_stride;
- }
-
- src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
-
- // The 2d filters have an odd |height| because the horizontal pass
- // generates context for the vertical pass.
- if (is_2d) {
- assert(height % 2 == 1);
- uint16x8_t sum;
- const uint8x8_t input = vld1_u8(src);
- if (filter_index == 3) { // |num_taps| == 2
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- } else if (filter_index == 4) {
- sum = vmull_u8(RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlsl_u8(sum, RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlsl_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- } else {
- assert(filter_index == 5);
- sum = vmull_u8(RightShift<2 * 8>(input), v_tap[2]);
- sum = vmlal_u8(sum, RightShift<3 * 8>(input), v_tap[3]);
- sum = vmlal_u8(sum, RightShift<4 * 8>(input), v_tap[4]);
- sum = vmlal_u8(sum, RightShift<5 * 8>(input), v_tap[5]);
- }
- // |sum| contains an int16_t value.
- sum = vreinterpretq_u16_s16(vrshrq_n_s16(
- vreinterpretq_s16_u16(sum), kInterRoundBitsHorizontal - 1));
- Store2<0>(dest16, sum);
- }
+ FilterHorizontalWidth2<filter_index, is_2d>(src, src_stride, dest,
+ pred_stride, height, v_tap);
}
}
}
@@ -452,78 +484,85 @@
}
template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const int16x8_t taps) {
+void Filter2DVerticalWidth8AndUp(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const int16x8_t taps) {
assert(width >= 8);
constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
int x = 0;
do {
- int16x8_t srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ int16x8_t srcs[9];
+ srcs[0] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 4) {
- srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[2] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps >= 6) {
- srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[3] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[4] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
if (num_taps == 8) {
- srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
- srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
+ srcs[5] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[6] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
}
}
}
- int y = 0;
+ uint8_t* d8 = dst8 + x;
+ uint16_t* d16 = dst16 + x;
+ int y = height;
do {
- srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src_x));
- src_x += src_stride;
-
- const int16x8_t sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ srcs[next_row] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ srcs[next_row + 1] = vreinterpretq_s16_u16(vld1q_u16(src));
+ src += 8;
+ const int16x8_t sum0 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 0, taps);
+ const int16x8_t sum1 =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs + 1, taps);
if (is_compound) {
- vst1q_u16(dst16 + x + y * dst_stride, vreinterpretq_u16_s16(sum));
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum0));
+ d16 += dst_stride;
+ vst1q_u16(d16, vreinterpretq_u16_s16(sum1));
+ d16 += dst_stride;
} else {
- vst1_u8(dst8 + x + y * dst_stride, vqmovun_s16(sum));
+ vst1_u8(d8, vqmovun_s16(sum0));
+ d8 += dst_stride;
+ vst1_u8(d8, vqmovun_s16(sum1));
+ d8 += dst_stride;
}
-
- srcs[0] = srcs[1];
+ srcs[0] = srcs[2];
if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
}
}
}
- } while (++y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
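Filter2DVerticalWidth8AndUp now keeps num_taps + 1 rows of the intermediate buffer live (hence srcs[9]) and emits two output rows per iteration before sliding the window down by two. A scalar model of one such step (a sketch under those assumptions, not the production code; rounding is omitted):

#include <cstdint>

// One two-row step of a vertical num_taps-tap filter. srcs[] holds
// num_taps + 1 consecutive row pointers; afterwards the caller slides the
// window down by two rows (srcs[k] = srcs[k + 2]).
void VerticalTwoRowStep(const int16_t* const srcs[], const int16_t* taps,
                        const int num_taps, const int width, int32_t* out0,
                        int32_t* out1) {
  for (int x = 0; x < width; ++x) {
    int32_t sum0 = 0;
    int32_t sum1 = 0;
    for (int k = 0; k < num_taps; ++k) {
      sum0 += taps[k] * srcs[k][x];      // Rows 0 .. num_taps - 1.
      sum1 += taps[k] * srcs[k + 1][x];  // Rows 1 .. num_taps.
    }
    out0[x] = sum0;
    out1[x] = sum1;
  }
}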
// Take advantage of |src_stride| == |width| to process two rows at a time.
template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth4(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
auto* dst8 = static_cast<uint8_t*>(dst);
auto* dst16 = static_cast<uint16_t*>(dst);
@@ -546,7 +585,7 @@
}
}
- int y = 0;
+ int y = height;
do {
srcs[num_taps] = vreinterpretq_s16_u16(vld1q_u16(src));
src += 8;
@@ -581,15 +620,15 @@
}
}
}
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
// Take advantage of |src_stride| == |width| to process four rows at a time.
template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const int16x8_t taps) {
+void Filter2DVerticalWidth2(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const int16x8_t taps) {
constexpr int next_row = (num_taps < 6) ? 4 : 8;
auto* dst8 = static_cast<uint8_t*>(dst);
@@ -662,11 +701,10 @@
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
const ptrdiff_t dst_stride, const int width, const int height,
- const int subpixel, const int filter_index) {
+ const int filter_id, const int filter_index) {
// Duplicate the absolute value for each tap. Negative taps are corrected
// by using the vmlsl_u8 instruction. Positive taps use vmlal_u8.
uint8x8_t v_tap[kSubPixelTaps];
- const int filter_id = (subpixel >> 6) & kSubPixelMask;
assert(filter_id != 0);
for (int k = 0; k < kSubPixelTaps; ++k) {
@@ -674,67 +712,58 @@
}
if (filter_index == 2) { // 8 tap.
- FilterHorizontal<8, 8, 2, true, is_2d, is_compound>(
+ FilterHorizontal<2, true, is_2d, is_compound>(
src, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
// Check if outside taps are positive.
if ((filter_id == 1) | (filter_id == 15)) {
- FilterHorizontal<6, 8, 1, false, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, false, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else {
- FilterHorizontal<6, 8, 1, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<1, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
}
} else if (filter_index == 0) { // 6 tap.
- FilterHorizontal<6, 8, 0, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<0, true, is_2d, is_compound>(
+ src + 1, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
- FilterHorizontal<4, 8, 4, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
- FilterHorizontal<4, 8, 5, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<5, true, is_2d, is_compound>(
+ src + 2, src_stride, dst, dst_stride, width, height, v_tap);
} else { // 2 tap.
- FilterHorizontal<2, 8, 3, true, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<3, true, is_2d, is_compound>(
+ src + 3, src_stride, dst, dst_stride, width, height, v_tap);
}
}
-int GetNumTapsInFilter(const int filter_index) {
- if (filter_index < 2) {
- // Despite the names these only use 6 taps.
- // kInterpolationFilterEightTap
- // kInterpolationFilterEightTapSmooth
- return 6;
+template <int vertical_taps>
+void Filter2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction, const ptrdiff_t pred_stride) {
+ auto* const dest = static_cast<uint8_t*>(prediction);
+ if (width >= 8) {
+ Filter2DVerticalWidth8AndUp<vertical_taps>(
+ intermediate_result, dest, pred_stride, width, height, taps);
+ } else if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
+ } else {
+ assert(width == 2);
+ Filter2DVerticalWidth2<vertical_taps>(intermediate_result, dest,
+ pred_stride, height, taps);
}
-
- if (filter_index == 2) {
- // kInterpolationFilterEightTapSharp
- return 8;
- }
-
- if (filter_index == 3) {
- // kInterpolationFilterBilinear
- return 2;
- }
-
- assert(filter_index > 3);
- // For small sizes (width/height <= 4) the large filters are replaced with 4
- // tap options.
- // If the original filters were |kInterpolationFilterEightTap| or
- // |kInterpolationFilterEightTapSharp| then it becomes
- // |kInterpolationFilterSwitchable|.
- // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
- // tap filter.
- return 4;
}
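The deleted GetNumTapsInFilter() presumably moves into the shared convolve.inc pulled in at the top of this file. Its mapping, as documented by the removed comments, is worth keeping in mind when reading the dispatch code below: indices 0 and 1 use 6 taps despite the "eight tap" names, 2 is the sharp 8-tap filter, 3 is the 2-tap bilinear filter, and 4 and 5 are the 4-tap small-block replacements. A compact restatement:

// Tap counts per filter_index, restating the deleted function.
constexpr int NumTapsModel(const int filter_index) {
  return (filter_index < 2)    ? 6  // "Eight tap" names, 6 real taps.
         : (filter_index == 2) ? 8  // kInterpolationFilterEightTapSharp.
         : (filter_index == 3) ? 2  // kInterpolationFilterBilinear.
                                : 4;  // 4 tap variants for width/height <= 4.
}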
void Convolve2D_NEON(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
- const int vertical_filter_index, const int subpixel_x,
- const int subpixel_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
@@ -744,68 +773,31 @@
intermediate_result[kMaxSuperBlockSizeInPixels *
(kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
const int intermediate_height = height + vertical_taps - 1;
-
const ptrdiff_t src_stride = reference_stride;
- const auto* src = static_cast<const uint8_t*>(reference) -
- (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ const auto* const src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride -
+ kHorizontalOffset;
DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
- width, intermediate_height, subpixel_x,
- horiz_filter_index);
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint8_t*>(prediction);
- const ptrdiff_t dest_stride = pred_stride;
- const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- assert(filter_id != 0);
-
- const int16x8_t taps =
- vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
-
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
if (vertical_taps == 8) {
- if (width == 2) {
- Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<8>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 6) {
- if (width == 2) {
- Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<6>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else if (vertical_taps == 4) {
- if (width == 2) {
- Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<4>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
} else { // |vertical_taps| == 2
- if (width == 2) {
- Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else if (width == 4) {
- Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
- taps);
- } else {
- Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
- taps);
- }
+ Filter2DVertical<2>(intermediate_result, width, height, taps, prediction,
+ pred_stride);
}
}
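Convolve2D_NEON follows the standard separable scheme: the horizontal pass writes height + vertical_taps - 1 rows into an intermediate buffer whose stride equals width, and the vertical pass then consumes it through the new width-based dispatch. A minimal scalar model of that ordering, using hypothetical unit taps rather than libgav1's filter tables:

#include <cstdint>
#include <vector>

// Separable 2-tap / 2-tap convolution: horizontal into an intermediate
// buffer (stride == width), then vertical. src needs one extra valid row
// and column; taps {1, 1} keep the arithmetic obvious.
void SeparableConvolveModel(const uint8_t* src, const int src_stride,
                            const int width, const int height, uint8_t* dst,
                            const int dst_stride) {
  const int intermediate_height = height + 2 - 1;
  std::vector<int> intermediate(width * intermediate_height);
  for (int y = 0; y < intermediate_height; ++y) {
    for (int x = 0; x < width; ++x) {
      intermediate[y * width + x] =
          src[y * src_stride + x] + src[y * src_stride + x + 1];
    }
  }
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int sum =
          intermediate[y * width + x] + intermediate[(y + 1) * width + x];
      dst[y * dst_stride + x] = static_cast<uint8_t>((sum + 2) >> 2);
    }
  }
}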
@@ -818,7 +810,7 @@
// increments. The first load covers the initial elements of src_x, while the
// final load covers the taps.
template <int grade_x>
-inline uint8x8x3_t LoadSrcVals(const uint8_t* src_x) {
+inline uint8x8x3_t LoadSrcVals(const uint8_t* const src_x) {
uint8x8x3_t ret;
const uint8x16_t src_val = vld1q_u8(src_x);
ret.val[0] = vget_low_u8(src_val);
@@ -841,7 +833,7 @@
}
template <int grade_x>
-inline void ConvolveKernelHorizontal2Tap(const uint8_t* src,
+inline void ConvolveKernelHorizontal2Tap(const uint8_t* const src,
const ptrdiff_t src_stride,
const int width, const int subpixel_x,
const int step_x,
@@ -873,7 +865,7 @@
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -890,7 +882,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
return;
}
@@ -913,7 +905,7 @@
// on x.
const uint8x8_t taps[2] = {VQTbl1U8(filter_taps0, filter_indices),
VQTbl1U8(filter_taps1, filter_indices)};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -930,7 +922,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -951,7 +943,7 @@
// This filter is only possible when width <= 4.
void ConvolveKernelHorizontalPositive4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -980,7 +972,7 @@
const uint8x8_t src_indices =
vmovn_u16(vshrq_n_u16(subpel_index_offsets, kScaleSubPixelBits));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped index vectors.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1000,7 +992,7 @@
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 4 tap filters in |kAbsHalfSubPixelFilters|[4].
@@ -1018,7 +1010,7 @@
// This filter is only possible when width <= 4.
inline void ConvolveKernelHorizontalSigned4Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int subpixel_x,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int subpixel_x,
const int step_x, const int intermediate_height, int16_t* intermediate) {
const int kernel_offset = 2;
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1055,7 +1047,7 @@
vadd_u8(src_indices_base, vdup_n_u8(2)),
vadd_u8(src_indices_base, vdup_n_u8(3))};
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x16_t src_vals = vld1q_u8(src_x);
@@ -1072,7 +1064,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
}
// Pre-transpose the 6 tap filters in |kAbsHalfSubPixelFilters|[0].
@@ -1093,9 +1085,9 @@
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1137,7 +1129,7 @@
for (int i = 0; i < 6; ++i) {
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1152,7 +1144,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1186,9 +1178,9 @@
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalMixed6Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const int kernel_offset = 1;
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
@@ -1235,7 +1227,7 @@
mixed_taps[0] = vmovl_s8(VQTbl1S8(mixed_filter_taps[0], filter_indices));
mixed_taps[1] = vmovl_s8(VQTbl1S8(mixed_filter_taps[1], filter_indices));
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1254,7 +1246,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1280,9 +1272,9 @@
// This filter is only possible when width >= 8.
template <int grade_x>
inline void ConvolveKernelHorizontalSigned8Tap(
- const uint8_t* src, const ptrdiff_t src_stride, const int width,
+ const uint8_t* const src, const ptrdiff_t src_stride, const int width,
const int subpixel_x, const int step_x, const int intermediate_height,
- int16_t* intermediate) {
+ int16_t* const intermediate) {
const uint8x8_t one = vdup_n_u8(1);
const uint8x8_t filter_index_mask = vdup_n_u8(kSubPixelMask);
const int ref_x = subpixel_x >> kScaleSubPixelBits;
@@ -1320,7 +1312,7 @@
taps[i] = VQTbl1U8(filter_taps[i], filter_indices);
}
- int y = 0;
+ int y = intermediate_height;
do {
// Load a pool of samples to select from using stepped indices.
const uint8x8x3_t src_vals = LoadSrcVals<grade_x>(src_x);
@@ -1336,7 +1328,7 @@
kInterRoundBitsHorizontal - 1));
src_x += src_stride;
intermediate_x += kIntermediateStride;
- } while (++y < intermediate_height);
+ } while (--y != 0);
x += 8;
p += step_x8;
} while (x < width);
@@ -1344,9 +1336,9 @@
// This function handles blocks of width 2 or 4.
template <int num_taps, int grade_y, int width, bool is_compound>
-void ConvolveVerticalScale4xH(const int16_t* src, const int subpixel_y,
+void ConvolveVerticalScale4xH(const int16_t* const src, const int subpixel_y,
const int filter_index, const int step_y,
- const int height, void* dest,
+ const int height, void* const dest,
const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
const int16_t* src_y = src;
@@ -1357,8 +1349,8 @@
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1_s16(src_y + i * src_stride);
}
@@ -1411,16 +1403,16 @@
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
template <int num_taps, int grade_y, bool is_compound>
-inline void ConvolveVerticalScale(const int16_t* src, const int width,
+inline void ConvolveVerticalScale(const int16_t* const src, const int width,
const int subpixel_y, const int filter_index,
const int step_y, const int height,
- void* dest, const ptrdiff_t dest_stride) {
+ void* const dest,
+ const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
// A possible improvement is to use arithmetic to decide how many times to
// apply filters to the same source before checking whether to load new srcs.
@@ -1431,15 +1423,15 @@
uint8_t* dest_y;
int x = 0;
- do { // x < width
- const int16_t* src_x = src + x;
+ do {
+ const int16_t* const src_x = src + x;
const int16_t* src_y = src_x;
dest16_y = static_cast<uint16_t*>(dest) + x;
dest_y = static_cast<uint8_t*>(dest) + x;
int p = subpixel_y & 1023;
int prev_p = p;
- int y = 0;
- do { // y < height
+ int y = height;
+ do {
for (int i = 0; i < num_taps; ++i) {
s[i] = vld1q_s16(src_y + i * src_stride);
}
@@ -1478,9 +1470,8 @@
prev_p = p;
dest16_y += dest_stride;
dest_y += dest_stride;
-
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
x += 8;
} while (x < width);
}
@@ -1492,7 +1483,7 @@
const int vertical_filter_index, const int subpixel_x,
const int subpixel_y, const int step_x,
const int step_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ void* const prediction, const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
assert(step_x <= 2048);
@@ -1727,16 +1718,18 @@
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
// Set |src| to the outermost tap.
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint8_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint8_t*>(prediction);
DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
- subpixel_x, filter_index);
+ horizontal_filter_id, filter_index);
}
// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
@@ -1748,14 +1741,14 @@
template <int filter_index, bool is_compound = false,
bool negative_outside_taps = false>
-void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
+void FilterVertical(const uint8_t* const src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
const int width, const int height,
const uint8x8_t* const taps) {
const int num_taps = GetNumTapsInFilter(filter_index);
const int next_row = num_taps - 1;
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
+ auto* const dst8 = static_cast<uint8_t*>(dst);
+ auto* const dst16 = static_cast<uint16_t*>(dst);
assert(width >= 8);
int x = 0;
@@ -1783,6 +1776,9 @@
}
}
+ // Decreasing the y loop counter produces worse code with clang.
+ // Don't unroll this loop; doing so generates too much code and makes the
+ // decoder even slower.
int y = 0;
do {
srcs[next_row] = vld1_u8(src_x);
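Most loops in this change flip from `int y = 0; ... while (++y < height)` to the count-down form `int y = height; ... while (--y != 0)`, which compares against zero instead of height; the comment above records the one loop where clang generates better code counting up. The two forms run the body the same number of times for any height >= 1:

#include <cassert>

int main() {
  const int height = 7;
  int up = 0;
  int y = 0;
  do { ++up; } while (++y < height);  // Count-up form kept above.
  int down = 0;
  y = height;
  do { ++down; } while (--y != 0);  // Count-down form used elsewhere.
  assert(up == down && up == height);
  return 0;
}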
@@ -1833,7 +1829,7 @@
srcs[0] = Load4(src);
src += src_stride;
- int y = 0;
+ int y = height;
do {
srcs[0] = Load4<1>(src, srcs[0]);
src += src_stride;
@@ -1858,8 +1854,8 @@
}
srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 4) {
srcs[4] = vdup_n_u8(0);
@@ -1871,7 +1867,7 @@
src += src_stride;
srcs[1] = vext_u8(srcs[0], srcs[2], 4);
- int y = 0;
+ int y = height;
do {
srcs[2] = Load4<1>(src, srcs[2]);
src += src_stride;
@@ -1898,8 +1894,8 @@
srcs[0] = srcs[2];
srcs[1] = srcs[3];
srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 6) {
srcs[6] = vdup_n_u8(0);
@@ -1916,7 +1912,7 @@
src += src_stride;
srcs[3] = vext_u8(srcs[2], srcs[4], 4);
- int y = 0;
+ int y = height;
do {
srcs[4] = Load4<1>(src, srcs[4]);
src += src_stride;
@@ -1945,8 +1941,8 @@
srcs[2] = srcs[4];
srcs[3] = srcs[5];
srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else if (num_taps == 8) {
srcs[8] = vdup_n_u8(0);
@@ -1968,7 +1964,7 @@
src += src_stride;
srcs[5] = vext_u8(srcs[4], srcs[6], 4);
- int y = 0;
+ int y = height;
do {
srcs[6] = Load4<1>(src, srcs[6]);
src += src_stride;
@@ -1999,8 +1995,8 @@
srcs[4] = srcs[6];
srcs[5] = srcs[7];
srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2213,22 +2209,23 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y,
- const int width, const int height, void* prediction,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* const prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint8_t*>(prediction);
+ auto* const dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
uint8x8_t taps[8];
for (int k = 0; k < kSubPixelTaps; ++k) {
- taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
}
if (filter_index == 0) { // 6 tap.
@@ -2242,8 +2239,8 @@
FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
taps + 1);
}
- } else if ((filter_index == 1) &
- ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
if (width == 2) {
FilterVertical2xH<1>(src, src_stride, dest, dest_stride, height,
taps + 1);
@@ -2255,8 +2252,8 @@
taps + 1);
}
} else if ((filter_index == 1) &
- ((filter_id == 7) | (filter_id == 8) |
- (filter_id == 9))) { // 6 tap with weird negative taps.
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
if (width == 2) {
FilterVertical2xH<1,
/*negative_outside_taps=*/true>(
@@ -2302,14 +2299,15 @@
taps + 2);
}
} else {
- // 4 tap. When |filter_index| == 1 the |filter_id| values listed below map
- // to 4 tap filters.
+ // 4 tap. When |filter_index| == 1 the |vertical_filter_id| values listed
+ // below map to 4 tap filters.
assert(filter_index == 5 ||
(filter_index == 1 &&
- (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
- filter_id == 5 || filter_id == 6 || filter_id == 10 ||
- filter_id == 11 || filter_id == 12 || filter_id == 13 ||
- filter_id == 14)));
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
// According to GetNumTapsInFilter() this has 6 taps but here we are
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
@@ -2329,8 +2327,9 @@
void ConvolveCompoundCopy_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t /*pred_stride*/) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -2338,7 +2337,7 @@
kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
if (width >= 16) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -2354,20 +2353,20 @@
} while (x < width);
src += src_stride;
dest += width;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t v_src = vld1_u8(&src[0]);
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
src += src_stride;
dest += width;
- } while (++y < height);
- } else { /* width == 4 */
+ } while (--y != 0);
+ } else { // width == 4
uint8x8_t v_src = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
v_src = Load4<0>(&src[0], v_src);
src += src_stride;
@@ -2376,28 +2375,29 @@
const uint16x8_t v_dest = vshll_n_u8(v_src, final_shift);
vst1q_u16(&dest[0], v_dest);
dest += 4 << 1;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
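ConvolveCompoundCopy performs no filtering at all: each 8-bit sample is widened and left-shifted into the 16-bit compound domain via vshll_n_u8, where final_shift = kInterRoundBitsVertical - kInterRoundBitsCompoundVertical; if the usual values (11 and 7) hold, which is assumed here, the shift is 4. Per-pixel scalar equivalent:

#include <cstdint>

// Scalar form of vshll_n_u8(v_src, final_shift): widen u8 -> u16, then
// shift into the compound range. kFinalShift == 4 is an assumed value.
inline uint16_t CompoundCopyPixel(const uint8_t src) {
  constexpr int kFinalShift = 4;
  return static_cast<uint16_t>(src) << kFinalShift;
}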
void ConvolveCompoundVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
- auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ assert(vertical_filter_id != 0);
uint8x8_t taps[8];
for (int k = 0; k < kSubPixelTaps; ++k) {
- taps[k] = vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][filter_id][k]);
+ taps[k] =
+ vdup_n_u8(kAbsHalfSubPixelFilters[filter_index][vertical_filter_id][k]);
}
if (filter_index == 0) { // 6 tap.
@@ -2408,8 +2408,8 @@
FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps + 1);
}
- } else if ((filter_index == 1) &
- ((filter_id == 1) | (filter_id == 15))) { // 5 tap.
+ } else if ((filter_index == 1) & ((vertical_filter_id == 1) |
+ (vertical_filter_id == 15))) { // 5 tap.
if (width == 4) {
FilterVertical4xH<1, /*is_compound=*/true>(src, src_stride, dest, 4,
height, taps + 1);
@@ -2418,8 +2418,8 @@
width, height, taps + 1);
}
} else if ((filter_index == 1) &
- ((filter_id == 7) | (filter_id == 8) |
- (filter_id == 9))) { // 6 tap with weird negative taps.
+ ((vertical_filter_id == 7) | (vertical_filter_id == 8) |
+ (vertical_filter_id == 9))) { // 6 tap with weird negative taps.
if (width == 4) {
FilterVertical4xH<1, /*is_compound=*/true,
/*negative_outside_taps=*/true>(src, src_stride, dest,
@@ -2457,10 +2457,11 @@
// to 4 tap filters.
assert(filter_index == 5 ||
(filter_index == 1 &&
- (filter_id == 2 || filter_id == 3 || filter_id == 4 ||
- filter_id == 5 || filter_id == 6 || filter_id == 10 ||
- filter_id == 11 || filter_id == 12 || filter_id == 13 ||
- filter_id == 14)));
+ (vertical_filter_id == 2 || vertical_filter_id == 3 ||
+ vertical_filter_id == 4 || vertical_filter_id == 5 ||
+ vertical_filter_id == 6 || vertical_filter_id == 10 ||
+ vertical_filter_id == 11 || vertical_filter_id == 12 ||
+ vertical_filter_id == 13 || vertical_filter_id == 14)));
// According to GetNumTapsInFilter() this has 6 taps but here we are
// treating it as though it has 4.
if (filter_index == 1) src += src_stride;
@@ -2477,22 +2478,41 @@
void ConvolveCompoundHorizontal_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
- const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
- auto* dest = static_cast<uint16_t*>(prediction);
+ const auto* const src =
+ static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* const dest = static_cast<uint16_t*>(prediction);
DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
- src, reference_stride, dest, width, width, height, subpixel_x,
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
filter_index);
}
-void ConvolveCompound2D_NEON(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int vertical_filter_index,
- const int subpixel_x, const int subpixel_y, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+template <int vertical_taps>
+void Compound2DVertical(const uint16_t* const intermediate_result,
+ const int width, const int height, const int16x8_t taps,
+ void* const prediction) {
+ auto* const dest = static_cast<uint16_t*>(prediction);
+ if (width == 4) {
+ Filter2DVerticalWidth4<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, height, taps);
+ } else {
+ Filter2DVerticalWidth8AndUp<vertical_taps, /*is_compound=*/true>(
+ intermediate_result, dest, width, width, height, taps);
+ }
+}
+
+void ConvolveCompound2D_NEON(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* const prediction,
+ const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
uint16_t
@@ -2512,56 +2532,26 @@
const auto* const src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride -
kHorizontalOffset;
-
DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
src, src_stride, intermediate_result, width, width, intermediate_height,
- subpixel_x, horiz_filter_index);
+ horizontal_filter_id, horiz_filter_index);
// Vertical filter.
- auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- assert(filter_id != 0);
-
- const ptrdiff_t dest_stride = width;
- const int16x8_t taps =
- vmovl_s8(vld1_s8(kHalfSubPixelFilters[vert_filter_index][filter_id]));
-
+ assert(vertical_filter_id != 0);
+ const int16x8_t taps = vmovl_s8(
+ vld1_s8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]));
if (vertical_taps == 8) {
- if (width == 4) {
- Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<8, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<8>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 6) {
- if (width == 4) {
- Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<6, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<6>(intermediate_result, width, height, taps, prediction);
} else if (vertical_taps == 4) {
- if (width == 4) {
- Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<4, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<4>(intermediate_result, width, height, taps, prediction);
} else { // |vertical_taps| == 2
- if (width == 4) {
- Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
- dest_stride, height, taps);
- } else {
- Filter2DVertical<2, /*is_compound=*/true>(
- intermediate_result, dest, dest_stride, width, height, taps);
- }
+ Compound2DVertical<2>(intermediate_result, width, height, taps, prediction);
}
}
-inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+inline void HalfAddHorizontal(const uint8_t* const src, uint8_t* const dst) {
const uint8x16_t left = vld1q_u8(src);
const uint8x16_t right = vld1q_u8(src + 1);
vst1q_u8(dst, vrhaddq_u8(left, right));
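HalfAddHorizontal is the whole horizontal intra block copy filter: a rounded average of each pixel with its right neighbor. vrhaddq_u8 computes (a + b + 1) >> 1 per lane without intermediate overflow; the scalar equivalent is:

#include <cstdint>

// Scalar form of vrhaddq_u8 on one lane: rounded halving add. The operands
// promote to int, so a + b + 1 cannot overflow.
inline uint8_t RoundedHalfAdd(const uint8_t a, const uint8_t b) {
  return static_cast<uint8_t>((a + b + 1) >> 1);
}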
@@ -2575,7 +2565,7 @@
const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
- int y = 0;
+ int y = height;
do {
HalfAddHorizontal(src, dst);
if (width >= 32) {
@@ -2607,7 +2597,7 @@
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyHorizontal_NEON(
@@ -2631,7 +2621,7 @@
IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
pred_stride);
} else if (width == 8) {
- int y = 0;
+ int y = height;
do {
const uint8x8_t left = vld1_u8(src);
const uint8x8_t right = vld1_u8(src + 1);
@@ -2639,11 +2629,11 @@
src += reference_stride;
dest += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -2658,13 +2648,13 @@
dest += pred_stride;
StoreHi4(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
assert(width == 2);
uint8x8_t left = vdup_n_u8(0);
uint8x8_t right = vdup_n_u8(0);
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -2679,8 +2669,8 @@
dest += pred_stride;
Store2<1>(dest, result);
dest += pred_stride;
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -2715,7 +2705,7 @@
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
below[0] = vld1q_u8(src);
if (width >= 32) {
@@ -2770,14 +2760,15 @@
}
}
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopyVertical_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -2798,7 +2789,7 @@
row = vld1_u8(src);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = vld1_u8(src);
src += reference_stride;
@@ -2807,13 +2798,13 @@
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else if (width == 4) {
uint8x8_t row = Load4(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load4<0>(src, below);
src += reference_stride;
@@ -2822,14 +2813,14 @@
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
} else {
assert(width == 2);
uint8x8_t row = Load2(src);
uint8x8_t below = vdup_n_u8(0);
src += reference_stride;
- int y = 0;
+ int y = height;
do {
below = Load2<0>(src, below);
src += reference_stride;
@@ -2838,7 +2829,7 @@
dest += pred_stride;
row = below;
- } while (++y < height);
+ } while (--y != 0);
}
}
@@ -2890,7 +2881,7 @@
}
src += src_remainder_stride;
- int y = 0;
+ int y = height;
do {
const uint16x8_t below_0 = vaddl_u8(vld1_u8(src), vld1_u8(src + 1));
vst1_u8(dst, vrshrn_n_u16(vaddq_u16(row[0], below_0), 2));
@@ -3001,14 +2992,15 @@
}
src += src_remainder_stride;
dst += dst_remainder_stride;
- } while (++y < height);
+ } while (--y != 0);
}
void ConvolveIntraBlockCopy2D_NEON(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* const prediction, const ptrdiff_t pred_stride) {
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
// Note: allow vertical access to height + 1. Because this function is only
@@ -3032,7 +3024,7 @@
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load4<0>(src, left);
right = Load4<0>(src + 1, right);
@@ -3051,8 +3043,8 @@
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
} else {
uint8x8_t left = Load2(src);
uint8x8_t right = Load2(src + 1);
@@ -3060,7 +3052,7 @@
uint16x4_t row = vget_low_u16(vaddl_u8(left, right));
- int y = 0;
+ int y = height;
do {
left = Load2<0>(src, left);
right = Load2<0>(src + 1, right);
@@ -3079,8 +3071,8 @@
dest += pred_stride;
row = vget_high_u16(below);
- y += 2;
- } while (y < height);
+ y -= 2;
+ } while (y != 0);
}
}
@@ -3112,7 +3104,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
index 04952ab..a0cd0ac 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.cc
@@ -30,10 +30,12 @@
namespace libgav1 {
namespace dsp {
-namespace {
constexpr int kInterPostRoundBit = 4;
+namespace low_bitdepth {
+namespace {
+
inline int16x8_t ComputeWeightedAverage8(const int16x8_t pred0,
const int16x8_t pred1,
const int16x4_t weights[2]) {
@@ -185,13 +187,167 @@
}
} // namespace
+} // namespace low_bitdepth
-void DistanceWeightedBlendInit_NEON() { Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+inline uint16x4x2_t ComputeWeightedAverage8(const uint16x4x2_t pred0,
+ const uint16x4x2_t pred1,
+ const uint16x4_t weights[2]) {
+ const uint32x4_t wpred0_lo = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred0_hi = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended_lo = vmlal_u16(wpred0_lo, weights[1], pred1.val[0]);
+ const uint32x4_t blended_hi = vmlal_u16(wpred0_hi, weights[1], pred1.val[1]);
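+  // Note (assumption from the AV1 distance-weighted compound design): the two
+  // weights sum to 16, so the compound offset and the post-round shift each
+  // pick up 4 extra bits of scale below.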
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const int32x4_t res_lo = vsubq_s32(vreinterpretq_s32_u32(blended_lo), offset);
+ const int32x4_t res_hi = vsubq_s32(vreinterpretq_s32_u32(blended_hi), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x2_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res_lo, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res_hi, kInterPostRoundBit + 4), bd_max);
+ return result;
+}
+
+inline uint16x4x4_t ComputeWeightedAverage8(const uint16x4x4_t pred0,
+ const uint16x4x4_t pred1,
+ const uint16x4_t weights[2]) {
+ const int32x4_t offset = vdupq_n_s32(kCompoundOffset * 16);
+ const uint32x4_t wpred0 = vmull_u16(weights[0], pred0.val[0]);
+ const uint32x4_t wpred1 = vmull_u16(weights[0], pred0.val[1]);
+ const uint32x4_t blended0 = vmlal_u16(wpred0, weights[1], pred1.val[0]);
+ const uint32x4_t blended1 = vmlal_u16(wpred1, weights[1], pred1.val[1]);
+ const int32x4_t res0 = vsubq_s32(vreinterpretq_s32_u32(blended0), offset);
+ const int32x4_t res1 = vsubq_s32(vreinterpretq_s32_u32(blended1), offset);
+ const uint32x4_t wpred2 = vmull_u16(weights[0], pred0.val[2]);
+ const uint32x4_t wpred3 = vmull_u16(weights[0], pred0.val[3]);
+ const uint32x4_t blended2 = vmlal_u16(wpred2, weights[1], pred1.val[2]);
+ const uint32x4_t blended3 = vmlal_u16(wpred3, weights[1], pred1.val[3]);
+ const int32x4_t res2 = vsubq_s32(vreinterpretq_s32_u32(blended2), offset);
+ const int32x4_t res3 = vsubq_s32(vreinterpretq_s32_u32(blended3), offset);
+ const uint16x4_t bd_max = vdup_n_u16((1 << kBitdepth10) - 1);
+ // Clip the result at (1 << bd) - 1.
+ uint16x4x4_t result;
+ result.val[0] =
+ vmin_u16(vqrshrun_n_s32(res0, kInterPostRoundBit + 4), bd_max);
+ result.val[1] =
+ vmin_u16(vqrshrun_n_s32(res1, kInterPostRoundBit + 4), bd_max);
+ result.val[2] =
+ vmin_u16(vqrshrun_n_s32(res2, kInterPostRoundBit + 4), bd_max);
+ result.val[3] =
+ vmin_u16(vqrshrun_n_s32(res3, kInterPostRoundBit + 4), bd_max);
+
+ return result;
+}
+
+// We could use vld1_u16_x2, but for compatibility reasons, use this function
+// instead. The compiler optimizes to the correct instruction.
+inline uint16x4x2_t LoadU16x4_x2(uint16_t const* ptr) {
+ uint16x4x2_t x;
+ // gcc/clang (64 bit) optimizes the following to ldp.
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ return x;
+}
+
+// We could use vld1_u16_x4, but for compatibility reasons, use this function
+// instead. The compiler optimizes to a pair of vld1_u16_x2, which showed better
+// performance in the speed tests.
+inline uint16x4x4_t LoadU16x4_x4(uint16_t const* ptr) {
+ uint16x4x4_t x;
+ x.val[0] = vld1_u16(ptr);
+ x.val[1] = vld1_u16(ptr + 4);
+ x.val[2] = vld1_u16(ptr + 8);
+ x.val[3] = vld1_u16(ptr + 12);
+ return x;
+}
+
+void DistanceWeightedBlend_NEON(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0, const uint8_t weight_1,
+ const int width, const int height,
+ void* const dest, const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const uint16x4_t weights[2] = {vdup_n_u16(weight_0), vdup_n_u16(weight_1)};
+
+ if (width == 4) {
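+    // Blend two 4-pixel rows per iteration.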
+ int y = height;
+ do {
+ const uint16x4x2_t src0 = LoadU16x4_x2(pred_0);
+ const uint16x4x2_t src1 = LoadU16x4_x2(pred_1);
+ const uint16x4x2_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + dst_stride, res.val[1]);
+ dst += dst_stride << 1;
+ pred_0 += 8;
+ pred_1 += 8;
+ y -= 2;
+ } while (y != 0);
+ } else if (width == 8) {
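+    // Blend two 8-pixel rows per iteration.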
+ int y = height;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst, res.val[0]);
+ vst1_u16(dst + 4, res.val[1]);
+ vst1_u16(dst + dst_stride, res.val[2]);
+ vst1_u16(dst + dst_stride + 4, res.val[3]);
+ dst += dst_stride << 1;
+ pred_0 += 16;
+ pred_1 += 16;
+ y -= 2;
+ } while (y != 0);
+ } else {
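+    // width >= 16: process 16 pixels per step across each row.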
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const uint16x4x4_t src0 = LoadU16x4_x4(pred_0 + x);
+ const uint16x4x4_t src1 = LoadU16x4_x4(pred_1 + x);
+ const uint16x4x4_t res = ComputeWeightedAverage8(src0, src1, weights);
+ vst1_u16(dst + x, res.val[0]);
+ vst1_u16(dst + x + 4, res.val[1]);
+ vst1_u16(dst + x + 8, res.val[2]);
+ vst1_u16(dst + x + 12, res.val[3]);
+ x += 16;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->distance_weighted_blend = DistanceWeightedBlend_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
index 4d8824c..94a799c 100644
--- a/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
+++ b/libgav1/src/dsp/arm/distance_weighted_blend_neon.h
@@ -34,6 +34,8 @@
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_DISTANCE_WEIGHTED_BLEND_NEON_H_
diff --git a/libgav1/src/dsp/arm/film_grain_neon.cc b/libgav1/src/dsp/arm/film_grain_neon.cc
index 2612466..8ee3745 100644
--- a/libgav1/src/dsp/arm/film_grain_neon.cc
+++ b/libgav1/src/dsp/arm/film_grain_neon.cc
@@ -1176,7 +1176,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intra_edge_neon.cc b/libgav1/src/dsp/arm/intra_edge_neon.cc
index 00b186a..074283f 100644
--- a/libgav1/src/dsp/arm/intra_edge_neon.cc
+++ b/libgav1/src/dsp/arm/intra_edge_neon.cc
@@ -25,7 +25,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
-#include "src/utils/common.h" // RightShiftWithRounding()
+#include "src/utils/common.h"
namespace libgav1 {
namespace dsp {
@@ -35,6 +35,11 @@
// required.
constexpr int kKernelsNEON[3][2] = {{4, 8}, {5, 6}};
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
assert(strength == 1 || strength == 2 || strength == 3);
const int kernel_index = strength - 1;
@@ -44,6 +49,8 @@
// elements written is |size| - 1.
if (size == 1) return;
+ const uint8x16_t v_index = vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0f0e0d0c0b0a0908));
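+  // |v_index| holds lane indices 0..15; comparing |remainder| against it
+  // below selects which lanes keep their original values.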
// |strength| 1 and 2 use a 3 tap filter.
if (strength < 3) {
// The last value requires extending the buffer (duplicating
@@ -89,7 +96,6 @@
// |remainder| == 1 then we don't have to do anything.
const int remainder = (size - 1) & 0xf;
if (remainder > 1) {
- uint8_t temp[16];
const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
@@ -102,9 +108,11 @@
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_1, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[size - 1] = last_val;
@@ -173,7 +181,6 @@
// Like the 3 tap but if there are two remaining values we have already
// calculated them.
if (remainder > 2) {
- uint8_t temp[16];
const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);
@@ -193,9 +200,11 @@
const uint8x16_t result =
vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));
-
- vst1q_u8(temp, result);
- memcpy(dst_buffer + i, temp, remainder);
+ const uint8x16_t v_remainder = vdupq_n_u8(remainder);
+      // Create the overwrite mask.
+ const uint8x16_t mask = vcleq_u8(v_remainder, v_index);
+ const uint8x16_t dst_remainder = vbslq_u8(mask, src_2, result);
+ vst1q_u8(dst_buffer + i, dst_remainder);
}
dst_buffer[1] = special_vals[0];
@@ -284,13 +293,225 @@
}
} // namespace
+} // namespace low_bitdepth
-void IntraEdgeInit_NEON() { Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
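+// kRemainderMask[n] selects the first n lanes of the filtered result while
+// keeping the original values in the remaining lanes.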
+const uint16_t kRemainderMask[8][8] = {
+ {0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000},
+ {0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000},
+};
+
+void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
+ assert(strength == 1 || strength == 2 || strength == 3);
+ const int kernel_index = strength - 1;
+ auto* const dst_buffer = static_cast<uint16_t*>(buffer);
+
+ // The first element is not written out (but it is input) so the number of
+ // elements written is |size| - 1.
+ if (size == 1) return;
+
+ // |strength| 1 and 2 use a 3 tap filter.
+ if (strength < 3) {
+ // The last value requires extending the buffer (duplicating
+ // |dst_buffer[size - 1]). Calculate it here to avoid extra processing in
+ // neon.
+ const uint16_t last_val = RightShiftWithRounding(
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
+ kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
+ kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
+ 4);
+
+ const uint16_t krn0 = kKernelsNEON[kernel_index][0];
+ const uint16_t krn1 = kKernelsNEON[kernel_index][1];
+
+ // The first value we need gets overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+    // Process blocks until there are fewer than 8 values remaining.
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_0| will read past the
+ // end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ // Load the next row before overwriting. This loads an extra 7 values
+ // past |size| on the trailing iteration.
+ src_0 = vld1q_u16(dst_buffer + i + 7);
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ // The last output value |last_val| was already calculated so if
+ // |remainder| == 1 then we don't have to do anything.
+ const int remainder = (size - 1) & 0x7;
+ if (remainder > 1) {
+ const uint16x8_t src_1 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t sum_02 = vmulq_n_u16(vaddq_u16(src_0, src_2), krn0);
+ const uint16x8_t sum = vmlaq_n_u16(sum_02, src_1, krn1);
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_1);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[size - 1] = last_val;
+ return;
+ }
+
+ assert(strength == 3);
+ // 5 tap filter. The first element requires duplicating |buffer[0]| and the
+ // last two elements require duplicating |buffer[size - 1]|.
+ uint16_t special_vals[3];
+ special_vals[0] = RightShiftWithRounding(
+ (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
+ (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
+ 4);
+ // Clamp index for very small |size| values.
+ const int first_index_min = std::max(size - 4, 0);
+ const int second_index_min = std::max(size - 3, 0);
+ const int third_index_min = std::max(size - 2, 0);
+ special_vals[1] = RightShiftWithRounding(
+ (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
+ (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
+ (dst_buffer[size - 1] << 1),
+ 4);
+ special_vals[2] = RightShiftWithRounding(
+ (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
+      // (x << 2) + (x << 2) == x << 3
+ (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
+ 4);
+
+ // The first two values we need get overwritten by the output from the
+ // previous iteration.
+ uint16x8_t src_0 = vld1q_u16(dst_buffer - 1);
+ uint16x8_t src_1 = vld1q_u16(dst_buffer);
+ int i = 1;
+
+ for (; i < size - 7; i += 8) {
+ // Loading these at the end of the block with |src_[01]| will read past
+ // the end of |top_row_data[160]|, the source of |buffer|.
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+
+    // Load the next rows before overwriting.
+ src_0 = vld1q_u16(dst_buffer + i + 6);
+ src_1 = vld1q_u16(dst_buffer + i + 7);
+
+ vst1q_u16(dst_buffer + i, result);
+ }
+
+ const int remainder = (size - 1) & 0x7;
+ // Like the 3 tap but if there are two remaining values we have already
+ // calculated them.
+ if (remainder > 2) {
+ const uint16x8_t src_2 = vld1q_u16(dst_buffer + i);
+ const uint16x8_t src_3 = vld1q_u16(dst_buffer + i + 1);
+ const uint16x8_t src_4 = vld1q_u16(dst_buffer + i + 2);
+ const uint16x8_t sum_04 = vshlq_n_u16(vaddq_u16(src_0, src_4), 1);
+ const uint16x8_t sum_123 = vaddq_u16(vaddq_u16(src_1, src_2), src_3);
+ const uint16x8_t sum = vaddq_u16(sum_04, vshlq_n_u16(sum_123, 2));
+ const uint16x8_t result = vrshrq_n_u16(sum, 4);
+ const uint16x8_t mask = vld1q_u16(kRemainderMask[remainder]);
+ const uint16x8_t dst_remainder = vbslq_u16(mask, result, src_2);
+ vst1q_u16(dst_buffer + i, dst_remainder);
+ }
+
+ dst_buffer[1] = special_vals[0];
+ // Avoid overwriting |dst_buffer[0]|.
+ if (size > 2) dst_buffer[size - 2] = special_vals[1];
+ dst_buffer[size - 1] = special_vals[2];
+}
+
+void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
+ assert(size % 4 == 0 && size <= 16);
+ auto* const pixel_buffer = static_cast<uint16_t*>(buffer);
+
+  // Extend the first and last samples.
+ pixel_buffer[-2] = pixel_buffer[-1];
+ pixel_buffer[size] = pixel_buffer[size - 1];
+
+ const int16x8_t src_lo = vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2));
+ const int16x8_t src_hi =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer - 2 + 8));
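+  // The upsample kernel is (-1, 9, 9, -1) / 16; 9 * src == src + (src << 3).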
+ const int16x8_t src9_hi = vaddq_s16(src_hi, vshlq_n_s16(src_hi, 3));
+ const int16x8_t src9_lo = vaddq_s16(src_lo, vshlq_n_s16(src_lo, 3));
+
+ int16x8_t sum_lo = vsubq_s16(vextq_s16(src9_lo, src9_hi, 1), src_lo);
+ sum_lo = vaddq_s16(sum_lo, vextq_s16(src9_lo, src9_hi, 2));
+ sum_lo = vsubq_s16(sum_lo, vextq_s16(src_lo, src_hi, 3));
+ sum_lo = vrshrq_n_s16(sum_lo, 4);
+
+ uint16x8x2_t result_lo;
+ result_lo.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_lo, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_lo.val[1] = vreinterpretq_u16_s16(vextq_s16(src_lo, src_hi, 2));
+
+ if (size > 8) {
+ const int16x8_t src_hi_extra =
+ vreinterpretq_s16_u16(vld1q_u16(pixel_buffer + 16 - 2));
+ const int16x8_t src9_hi_extra =
+ vaddq_s16(src_hi_extra, vshlq_n_s16(src_hi_extra, 3));
+
+ int16x8_t sum_hi = vsubq_s16(vextq_s16(src9_hi, src9_hi_extra, 1), src_hi);
+ sum_hi = vaddq_s16(sum_hi, vextq_s16(src9_hi, src9_hi_extra, 2));
+ sum_hi = vsubq_s16(sum_hi, vextq_s16(src_hi, src_hi_extra, 3));
+ sum_hi = vrshrq_n_s16(sum_hi, 4);
+
+ uint16x8x2_t result_hi;
+ result_hi.val[0] =
+ vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(sum_hi, vdupq_n_s16(0))),
+ vdupq_n_u16((1 << kBitdepth10) - 1));
+ result_hi.val[1] =
+ vreinterpretq_u16_s16(vextq_s16(src_hi, src_hi_extra, 2));
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ vst2q_u16(pixel_buffer + 15, result_hi);
+ } else {
+ vst2q_u16(pixel_buffer - 1, result_lo);
+ }
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->intra_edge_filter = IntraEdgeFilter_NEON;
+ dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraEdgeInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intra_edge_neon.h b/libgav1/src/dsp/arm/intra_edge_neon.h
index d3bb243..28e3494 100644
--- a/libgav1/src/dsp/arm/intra_edge_neon.h
+++ b/libgav1/src/dsp/arm/intra_edge_neon.h
@@ -34,6 +34,9 @@
#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeFilter LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_IntraEdgeUpsampler LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRA_EDGE_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_cfl_neon.cc b/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
index 45fe33b..8d8748f 100644
--- a/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_cfl_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -27,45 +27,20 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
-namespace low_bitdepth {
-namespace {
-
-uint8x16_t Set2ValuesQ(const uint8_t* a) {
- uint16_t combined_values = a[0] | a[1] << 8;
- return vreinterpretq_u8_u16(vdupq_n_u16(combined_values));
-}
-
-uint32_t SumVector(uint32x2_t a) {
-#if defined(__aarch64__)
- return vaddv_u32(a);
-#else
- const uint64x1_t b = vpaddl_u32(a);
- return vget_lane_u32(vreinterpret_u32_u64(b), 0);
-#endif // defined(__aarch64__)
-}
-
-uint32_t SumVector(uint32x4_t a) {
-#if defined(__aarch64__)
- return vaddvq_u32(a);
-#else
- const uint64x2_t b = vpaddlq_u32(a);
- const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
- return vget_lane_u32(vreinterpret_u32_u64(c), 0);
-#endif // defined(__aarch64__)
-}
// Divide by the number of elements.
-uint32_t Average(const uint32_t sum, const int width, const int height) {
+inline uint32_t Average(const uint32_t sum, const int width, const int height) {
return RightShiftWithRounding(sum, FloorLog2(width) + FloorLog2(height));
}
// Subtract |val| from every element in |a|.
-void BlockSubtract(const uint32_t val,
- int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
- const int width, const int height) {
+inline void BlockSubtract(const uint32_t val,
+ int16_t a[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int width, const int height) {
assert(val <= INT16_MAX);
const int16x8_t val_v = vdupq_n_s16(static_cast<int16_t>(val));
@@ -94,6 +69,9 @@
}
}
+namespace low_bitdepth {
+namespace {
+
template <int block_width, int block_height>
void CflSubsampler420_NEON(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -122,26 +100,27 @@
sum = SumVector(running_sum);
} else if (block_width == 8) {
- const uint8x16_t x_index = {0, 0, 2, 2, 4, 4, 6, 6,
- 8, 8, 10, 10, 12, 12, 14, 14};
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
+ const uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
+ const uint16x8_t x_max_index =
+ vdupq_n_u16(max_luma_width == 8 ? max_luma_width - 2 : 16);
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- const uint8x16_t x_max0 = Set2ValuesQ(src + max_luma_width - 2);
- const uint8x16_t x_max1 = Set2ValuesQ(src + max_luma_width - 2 + stride);
+ const uint8x16_t row0 = vld1q_u8(src);
+ const uint8x16_t row1 = vld1q_u8(src + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
- uint8x16_t row0 = vld1q_u8(src);
- row0 = vbslq_u8(x_mask, row0, x_max0);
- uint8x16_t row1 = vld1q_u8(src + stride);
- row1 = vbslq_u8(x_mask, row1, x_max1);
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+ vdupq_lane_u16(vget_low_u16(sum_row_shifted), 3);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y], vreinterpretq_s16_u16(final_sum_row));
- uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
- sum_row = vshlq_n_u16(sum_row, 1);
- running_sum = vpadalq_u16(running_sum, sum_row);
- vst1q_s16(luma[y], vreinterpretq_s16_u16(sum_row));
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
@@ -150,45 +129,35 @@
sum = SumVector(running_sum);
} else /* block_width >= 16 */ {
- const uint8x16_t x_max_index = vdupq_n_u8(max_luma_width - 2);
+ const uint16x8_t x_max_index = vdupq_n_u16(max_luma_width - 2);
uint32x4_t running_sum = vdupq_n_u32(0);
for (int y = 0; y < block_height; ++y) {
- uint8x16_t x_index = {0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30};
- const uint8x16_t x_max00 = vdupq_n_u8(src[max_luma_width - 2]);
- const uint8x16_t x_max01 = vdupq_n_u8(src[max_luma_width - 2 + 1]);
- const uint8x16_t x_max10 = vdupq_n_u8(src[stride + max_luma_width - 2]);
- const uint8x16_t x_max11 =
- vdupq_n_u8(src[stride + max_luma_width - 2 + 1]);
- for (int x = 0; x < block_width; x += 16) {
- const ptrdiff_t src_x_offset = x << 1;
- const uint8x16_t x_mask = vcltq_u8(x_index, x_max_index);
- const uint8x16x2_t row0 = vld2q_u8(src + src_x_offset);
- const uint8x16x2_t row1 = vld2q_u8(src + src_x_offset + stride);
- const uint8x16_t row_masked_00 = vbslq_u8(x_mask, row0.val[0], x_max00);
- const uint8x16_t row_masked_01 = vbslq_u8(x_mask, row0.val[1], x_max01);
- const uint8x16_t row_masked_10 = vbslq_u8(x_mask, row1.val[0], x_max10);
- const uint8x16_t row_masked_11 = vbslq_u8(x_mask, row1.val[1], x_max11);
+      // Calculate the 2x2 sum at the max_luma offset.
+ const uint8_t a00 = src[max_luma_width - 2];
+ const uint8_t a01 = src[max_luma_width - 1];
+ const uint8_t a10 = src[max_luma_width - 2 + stride];
+ const uint8_t a11 = src[max_luma_width - 1 + stride];
+ // Dup the 2x2 sum at the max luma offset.
+ const uint16x8_t max_luma_sum =
+          vdupq_n_u16(static_cast<uint16_t>((a00 + a01 + a10 + a11) << 1));
+ uint16x8_t x_index = {0, 2, 4, 6, 8, 10, 12, 14};
- uint16x8_t sum_row_lo =
- vaddl_u8(vget_low_u8(row_masked_00), vget_low_u8(row_masked_01));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_10));
- sum_row_lo = vaddw_u8(sum_row_lo, vget_low_u8(row_masked_11));
- sum_row_lo = vshlq_n_u16(sum_row_lo, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_lo);
- vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(sum_row_lo));
+ ptrdiff_t src_x_offset = 0;
+ for (int x = 0; x < block_width; x += 8, src_x_offset += 16) {
+ const uint16x8_t x_mask = vcltq_u16(x_index, x_max_index);
+ const uint8x16_t row0 = vld1q_u8(src + src_x_offset);
+ const uint8x16_t row1 = vld1q_u8(src + src_x_offset + stride);
+ const uint16x8_t sum_row = vpadalq_u8(vpaddlq_u8(row0), row1);
+ const uint16x8_t sum_row_shifted = vshlq_n_u16(sum_row, 1);
+ const uint16x8_t final_sum_row =
+ vbslq_u16(x_mask, sum_row_shifted, max_luma_sum);
+ vst1q_s16(luma[y] + x, vreinterpretq_s16_u16(final_sum_row));
- uint16x8_t sum_row_hi =
- vaddl_u8(vget_high_u8(row_masked_00), vget_high_u8(row_masked_01));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_10));
- sum_row_hi = vaddw_u8(sum_row_hi, vget_high_u8(row_masked_11));
- sum_row_hi = vshlq_n_u16(sum_row_hi, 1);
- running_sum = vpadalq_u16(running_sum, sum_row_hi);
- vst1q_s16(luma[y] + x + 8, vreinterpretq_s16_u16(sum_row_hi));
-
- x_index = vaddq_u8(x_index, vdupq_n_u8(32));
+ running_sum = vpadalq_u16(running_sum, final_sum_row);
+ x_index = vaddq_u16(x_index, vdupq_n_u16(16));
}
+
if (y << 1 < max_luma_height - 2) {
src += stride << 1;
}
@@ -209,17 +178,30 @@
uint32_t sum;
if (block_width == 4) {
assert(max_luma_width >= 4);
+ assert(max_luma_height <= block_height);
+ assert((max_luma_height % 2) == 0);
uint32x4_t running_sum = vdupq_n_u32(0);
uint8x8_t row = vdup_n_u8(0);
- for (int y = 0; y < block_height; y += 2) {
+ uint16x8_t row_shifted;
+ int y = 0;
+ do {
row = Load4<0>(src, row);
row = Load4<1>(src + stride, row);
if (y < (max_luma_height - 1)) {
src += stride << 1;
}
- const uint16x8_t row_shifted = vshll_n_u8(row, 3);
+ row_shifted = vshll_n_u8(row, 3);
+ running_sum = vpadalq_u16(running_sum, row_shifted);
+ vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
+ vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
+ y += 2;
+ } while (y < max_luma_height);
+
+ row_shifted =
+ vcombine_u16(vget_high_u16(row_shifted), vget_high_u16(row_shifted));
+ for (; y < block_height; y += 2) {
running_sum = vpadalq_u16(running_sum, row_shifted);
vst1_s16(luma[y], vreinterpret_s16_u16(vget_low_u16(row_shifted)));
vst1_s16(luma[y + 1], vreinterpret_s16_u16(vget_high_u16(row_shifted)));
@@ -463,12 +445,874 @@
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflSubsampler
+#ifndef __aarch64__
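+// vpaddq_u16 is an A64-only intrinsic; emulate it for 32-bit NEON targets.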
+uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+ return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+ vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// This duplicates the last two 16-bit values in |row|.
+inline uint16x8_t LastRowSamples(const uint16x8_t row) {
+ const uint32x2_t a = vget_high_u32(vreinterpretq_u32_u16(row));
+ const uint32x4_t b = vdupq_lane_u32(a, 1);
+ return vreinterpretq_u16_u32(b);
+}
+
+// This duplicates the last unsigned 16-bit value in |row|.
+inline uint16x8_t LastRowResult(const uint16x8_t row) {
+ const uint16x4_t a = vget_high_u16(row);
+ const uint16x8_t b = vdupq_lane_u16(a, 0x3);
+ return b;
+}
+
+// This duplicates the last signed 16-bit value in |row|.
+inline int16x8_t LastRowResult(const int16x8_t row) {
+ const int16x4_t a = vget_high_s16(row);
+ const int16x8_t b = vdupq_lane_s16(a, 0x3);
+ return b;
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline uint16x8_t StoreLumaResults4_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
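+  // Doubling the 2x2 sums carries the << 3 precision of the CfL buffer:
+  // ((sum >> 2) << 3) == (sum << 1).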
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(vget_low_u16(result_shifted)));
+ vst1_s16(luma_ptr + kCflLumaBufferStride,
+ vreinterpret_s16_u16(vget_high_u16(result_shifted)));
+ return result_shifted;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline uint16x8_t StoreLumaResults8_420(const uint16x8_t vertical_sum0,
+ const uint16x8_t vertical_sum1,
+ int16_t* luma_ptr) {
+ const uint16x8_t result = vpaddq_u16(vertical_sum0, vertical_sum1);
+ const uint16x8_t result_shifted = vshlq_n_u16(result, 1);
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(result_shifted));
+ return result_shifted;
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint16x4_t sum = vdup_n_u16(0);
+ uint16x4_t samples[2];
+ int y = visible_height;
+
+ do {
+ samples[0] = vld1_u16(src);
+ samples[1] = vld1_u16(src + src_stride);
+ src += src_stride << 1;
+ sum = vadd_u16(sum, samples[0]);
+ sum = vadd_u16(sum, samples[1]);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples[1] = vshl_n_u16(samples[1], 1);
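+    // Each step of the loop below accounts for two replicated rows, hence
+    // the doubled last row.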
+ do {
+ sum = vadd_u16(sum, samples[1]);
+ y += 2;
+ } while (y < block_height);
+ }
+
+  // The left shift by 3 (to increase precision) is absorbed by reducing the
+  // right shift by (log2 of width 4) + 1.
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(vpaddl_u16(sum)), block_height_log2 - 1);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x4_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1_s16(ssrc);
+ ssample = vshl_n_s16(ssample, 3);
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+    // Replicate the last line.
+ do {
+ vst1_s16(luma_ptr, vsub_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples;
+ int y = visible_height;
+
+ do {
+ samples = vld1q_u16(src);
+ src += src_stride;
+ sum = vpadalq_u16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = vpadalq_u16(sum, samples);
+ } while (++y < block_height);
+ }
+
+  // The left shift by 3 (to increase precision) is absorbed by reducing the
+  // right shift by log2 of width 8.
+ const uint32_t average_sum =
+ RightShiftWithRounding(SumVector(sum), block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssample;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ ssample = vld1q_s16(ssrc);
+ ssample = vshlq_n_s16(ssample, 3);
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+    // Replicate the last line.
+ do {
+ vst1q_s16(luma_ptr, vsubq_s16(ssample, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_NEON<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_NEON<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ uint32x4_t sum = vdupq_n_u32(0);
+ uint16x8_t samples[4];
+ int y = visible_height;
+
+ do {
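+    // Vectors wholly past |max_luma_width| are filled by duplicating the
+    // last lane of the preceding vector.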
+ samples[0] = vld1q_u16(src);
+ samples[1] =
+ (max_luma_width >= 16) ? vld1q_u16(src + 8) : LastRowResult(samples[0]);
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? vld1q_u16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? vld1q_u16(src + 24)
+ : LastRowResult(samples[2]);
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ sum = vpadalq_u16(sum, inner_sum);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ uint16x8_t inner_sum = vaddq_u16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = vaddq_u16(samples[2], inner_sum);
+ inner_sum = vaddq_u16(samples[3], inner_sum);
+ }
+ do {
+ sum = vpadalq_u16(sum, inner_sum);
+ } while (++y < block_height);
+ }
+
+  // The left shift by 3 (to increase precision) is absorbed by subtracting 3
+  // from the right-shift factor (block_width_log2 + block_height_log2 - 3).
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(sum), block_width_log2 + block_height_log2 - 3);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ const auto* ssrc = static_cast<const int16_t*>(source);
+ int16x8_t ssamples_ext = vdupq_n_s16(0);
+ int16x8_t ssamples[4];
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ ssamples[idx] = vld1q_s16(&ssrc[x]);
+ ssamples[idx] = vshlq_n_s16(ssamples[idx], 3);
+ ssamples_ext = ssamples[idx];
+ } else {
+ ssamples[idx] = LastRowResult(ssamples_ext);
+ }
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ ssrc += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+    // Replicate the last line.
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ vst1q_s16(&luma_ptr[x], vsubq_s16(ssamples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_NEON<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
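+  // 4:2:0 consumes two source luma rows per output row, hence the >> 1.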
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row0 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row1 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row0, samples_row1);
+
+ const uint16x8_t samples_row2 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row3 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum23 = vaddq_u16(samples_row2, samples_row3);
+ uint16x8_t sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const uint16x8_t samples_row4 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row5 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum45 = vaddq_u16(samples_row4, samples_row5);
+
+ const uint16x8_t samples_row6 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t samples_row7 = vld1q_u16(src);
+ src += src_stride;
+ const uint16x8_t luma_sum67 = vaddq_u16(samples_row6, samples_row7);
+ sum =
+ vaddq_u16(sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+ const uint16x4_t final_fill =
+ vreinterpret_u16_s16(vld1_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum = vmovl_u16(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ vst1_s16(luma_ptr, vreinterpret_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 2 /*log2 of width 4*/);
+ const int16x4_t averages = vdup_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x4_t samples = vld1_s16(luma_ptr);
+ vst1_s16(luma_ptr, vsub_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const uint16x8_t samples_row10 = vld1q_u16(src);
+ const uint16x8_t samples_row11 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const uint16x8_t luma_sum00 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum01 = vaddq_u16(samples_row01, samples_row11);
+ uint16x8_t sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row20 = vld1q_u16(src);
+ const uint16x8_t samples_row21 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const uint16x8_t samples_row30 = vld1q_u16(src);
+ const uint16x8_t samples_row31 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const uint16x8_t luma_sum10 = vaddq_u16(samples_row20, samples_row30);
+ const uint16x8_t luma_sum11 = vaddq_u16(samples_row21, samples_row31);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row40 = vld1q_u16(src);
+ const uint16x8_t samples_row41 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const uint16x8_t samples_row50 = vld1q_u16(src);
+ const uint16x8_t samples_row51 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const uint16x8_t luma_sum20 = vaddq_u16(samples_row40, samples_row50);
+ const uint16x8_t luma_sum21 = vaddq_u16(samples_row41, samples_row51);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const uint16x8_t samples_row60 = vld1q_u16(src);
+ const uint16x8_t samples_row61 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const uint16x8_t samples_row70 = vld1q_u16(src);
+ const uint16x8_t samples_row71 = (max_luma_width == 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const uint16x8_t luma_sum30 = vaddq_u16(samples_row60, samples_row70);
+ const uint16x8_t luma_sum31 = vaddq_u16(samples_row61, samples_row71);
+ sum =
+ vaddq_u16(sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = vpadalq_u16(final_sum, sum);
+ y -= 4;
+ } while (y != 0);
+
+  // Duplicate the final row downward to fill the rows past max_luma_height.
+ const uint16x8_t final_fill =
+ vreinterpretq_u16_s16(vld1q_s16(luma_ptr - kCflLumaBufferStride));
+ const uint32x4_t final_fill_to_sum =
+ vaddl_u16(vget_low_u16(final_fill), vget_high_u16(final_fill));
+
+ for (y = luma_height; y < block_height; ++y) {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill));
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ }
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_height_log2 + 3 /*log2 of width 8*/);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_NEON<block_height_log2, 16>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ uint16x8_t final_fill0, final_fill1;
+ uint32x4_t final_sum = vdupq_n_u32(0);
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const uint16x8_t samples_row00 = vld1q_u16(src);
+ const uint16x8_t samples_row01 = (max_luma_width >= 16)
+ ? vld1q_u16(src + 8)
+ : LastRowSamples(samples_row00);
+ const uint16x8_t samples_row02 = (max_luma_width >= 24)
+ ? vld1q_u16(src + 16)
+ : LastRowSamples(samples_row01);
+ const uint16x8_t samples_row03 = (max_luma_width == 32)
+ ? vld1q_u16(src + 24)
+ : LastRowSamples(samples_row02);
+ const uint16x8_t samples_row10 = vld1q_u16(src_next);
+ const uint16x8_t samples_row11 = (max_luma_width >= 16)
+ ? vld1q_u16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const uint16x8_t samples_row12 = (max_luma_width >= 24)
+ ? vld1q_u16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const uint16x8_t samples_row13 = (max_luma_width == 32)
+ ? vld1q_u16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const uint16x8_t luma_sum0 = vaddq_u16(samples_row00, samples_row10);
+ const uint16x8_t luma_sum1 = vaddq_u16(samples_row01, samples_row11);
+ const uint16x8_t luma_sum2 = vaddq_u16(samples_row02, samples_row12);
+ const uint16x8_t luma_sum3 = vaddq_u16(samples_row03, samples_row13);
+ final_fill0 = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_fill1 = StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ const uint16x8_t sum = vaddq_u16(final_fill0, final_fill1);
+
+ final_sum = vpadalq_u16(final_sum, sum);
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const uint16x8_t wide_fill = LastRowResult(final_fill1);
+ final_sum = vpadalq_u16(final_sum, vshlq_n_u16(wide_fill, 1));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ uint32x4_t wide_fill;
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row; shifting by 2 accounts for
+      // the widening to 32-bit. (a << 2) == (a + a) << 1.
+ wide_fill = vshll_n_u16(vget_low_u16(LastRowResult(final_fill1)), 2);
+ }
+ const uint16x8_t final_inner_sum = vaddq_u16(final_fill0, final_fill1);
+ const uint32x4_t final_fill_to_sum = vaddl_u16(
+ vget_low_u16(final_inner_sum), vget_high_u16(final_inner_sum));
+
+ do {
+ vst1q_s16(luma_ptr, vreinterpretq_s16_u16(final_fill0));
+ vst1q_s16(luma_ptr + 8, vreinterpretq_s16_u16(final_fill1));
+ if (block_width_log2 == 5) {
+ final_sum = vaddq_u32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = vaddq_u32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ const uint32_t average_sum = RightShiftWithRounding(
+ SumVector(final_sum), block_width_log2 + block_height_log2);
+ const int16x8_t averages = vdupq_n_s16(static_cast<int16_t>(average_sum));
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const int16x8_t samples0 = vld1q_s16(luma_ptr);
+ vst1q_s16(luma_ptr, vsubq_s16(samples0, averages));
+ const int16x8_t samples1 = vld1q_s16(luma_ptr + 8);
+ const int16x8_t final_row_result = vsubq_s16(samples1, averages);
+ vst1q_s16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const int16x8_t wide_fill = LastRowResult(final_row_result);
+ vst1q_s16(luma_ptr + 16, wide_fill);
+ vst1q_s16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+//------------------------------------------------------------------------------
+// Choose subsampler based on max_luma_width
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_NEON(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_NEON<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
+// Clip |dc + ((alpha * luma) >> 6)| to [0, (1 << bitdepth) - 1].
+inline uint16x8_t Combine8(const int16x8_t luma, const int16x8_t alpha_abs,
+ const int16x8_t alpha_signed, const int16x8_t dc,
+ const uint16x8_t max_value) {
+ const int16x8_t luma_abs = vabsq_s16(luma);
+ const int16x8_t luma_alpha_sign =
+ vshrq_n_s16(veorq_s16(luma, alpha_signed), 15);
+ // (alpha * luma) >> 6
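+  // vqrdmulhq_s16 computes (a * b) >> 15 with rounding; the << 9 applied to
+  // |alpha| makes this (alpha * luma) >> 6.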
+ const int16x8_t la_abs = vqrdmulhq_s16(luma_abs, alpha_abs);
+ // Convert back to signed values.
+ const int16x8_t la =
+ vsubq_s16(veorq_s16(la_abs, luma_alpha_sign), luma_alpha_sign);
+ const int16x8_t result = vaddq_s16(la, dc);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Clip.
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(result, zero)), max_value);
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor4xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
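+  // Assumes |dst| was pre-filled with the DC prediction, so dst[0] holds the
+  // DC value.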
+ for (int y = 0; y < block_height; y += 2) {
+ const int16x4_t luma_row0 = vld1_s16(luma[y]);
+ const int16x4_t luma_row1 = vld1_s16(luma[y + 1]);
+ const int16x8_t combined_luma = vcombine_s16(luma_row0, luma_row1);
+ const uint16x8_t sum =
+ Combine8(combined_luma, alpha_abs, alpha_signed, dc, max_value);
+ vst1_u16(dst, vget_low_u16(sum));
+ dst += dst_stride;
+ vst1_u16(dst, vget_high_u16(sum));
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor8xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row = vld1q_s16(luma[y]);
+ const uint16x8_t sum =
+ Combine8(luma_row, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor16xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ dst += dst_stride;
+ }
+}
+
+template <int block_height, int bitdepth = 10>
+inline void CflIntraPredictor32xN_NEON(
+ void* const dest, const ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = stride >> 1;
+ const uint16x8_t max_value = vdupq_n_u16((1 << bitdepth) - 1);
+ const int16x8_t alpha_signed = vdupq_n_s16(alpha << 9);
+ const int16x8_t alpha_abs = vabsq_s16(alpha_signed);
+ const int16x8_t dc = vdupq_n_s16(dst[0]);
+ for (int y = 0; y < block_height; ++y) {
+ const int16x8_t luma_row_0 = vld1q_s16(luma[y]);
+ const int16x8_t luma_row_1 = vld1q_s16(luma[y] + 8);
+ const int16x8_t luma_row_2 = vld1q_s16(luma[y] + 16);
+ const int16x8_t luma_row_3 = vld1q_s16(luma[y] + 24);
+ const uint16x8_t sum_0 =
+ Combine8(luma_row_0, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_1 =
+ Combine8(luma_row_1, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_2 =
+ Combine8(luma_row_2, alpha_abs, alpha_signed, dc, max_value);
+ const uint16x8_t sum_3 =
+ Combine8(luma_row_3, alpha_abs, alpha_signed, dc, max_value);
+ vst1q_u16(dst, sum_0);
+ vst1q_u16(dst + 8, sum_1);
+ vst1q_u16(dst + 16, sum_2);
+ vst1q_u16(dst + 24, sum_3);
+ dst += dst_stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_NEON<5, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_NEON<4>;
+
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<2>;
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<3>;
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<4>;
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_NEON<5>;
+
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 2>;
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 3>;
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 4>;
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<4, 5>;
+
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 3>;
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 4>;
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_NEON<5, 5>;
+
+ dsp->cfl_intra_predictors[kTransformSize4x4] = CflIntraPredictor4xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize4x8] = CflIntraPredictor4xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize4x16] = CflIntraPredictor4xN_NEON<16>;
+
+ dsp->cfl_intra_predictors[kTransformSize8x4] = CflIntraPredictor8xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize8x8] = CflIntraPredictor8xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize8x16] = CflIntraPredictor8xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize8x32] = CflIntraPredictor8xN_NEON<32>;
+
+ dsp->cfl_intra_predictors[kTransformSize16x4] = CflIntraPredictor16xN_NEON<4>;
+ dsp->cfl_intra_predictors[kTransformSize16x8] = CflIntraPredictor16xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor16xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor16xN_NEON<32>;
+ dsp->cfl_intra_predictors[kTransformSize32x8] = CflIntraPredictor32xN_NEON<8>;
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor32xN_NEON<16>;
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor32xN_NEON<32>;
+ // Max Cfl predictor size is 32x32.
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intrapred_cfl_neon.h b/libgav1/src/dsp/arm/intrapred_cfl_neon.h
new file mode 100644
index 0000000..b4f983a
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_cfl_neon.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers; see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+// 4x4
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// -----------------------------------------------------------------------------
+// 10bpp
+
+// 4x4
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x8
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 4x16
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x4
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x8
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x16
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 8x32
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x4
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x8
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x16
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 16x32
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x8
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x16
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
+
+// 32x32
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_CFL_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.cc b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
index 805ba81..3f5edbd 100644
--- a/libgav1/src/dsp/arm/intrapred_directional_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_directional_neon.cc
@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_directional.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
-#include <algorithm> // std::min
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
@@ -35,14 +35,14 @@
namespace low_bitdepth {
namespace {
-// Blend two values based on a 32 bit weight.
+// Blend two values based on weights that sum to 32.
inline uint8x8_t WeightedBlend(const uint8x8_t a, const uint8x8_t b,
const uint8x8_t a_weight,
const uint8x8_t b_weight) {
const uint16x8_t a_product = vmull_u8(a, a_weight);
const uint16x8_t b_product = vmull_u8(b, b_weight);
- return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5);
+ return vrshrn_n_u16(vaddq_u16(a_product, b_product), 5 /*log2(32)*/);
}
// For vertical operations the weights are one constant value.
@@ -112,7 +112,7 @@
// 4 wide subsamples the output. 8 wide subsamples the input.
if (width == 4) {
const uint8x8_t left_values = vld1_u8(top + top_base_x);
- const uint8x8_t right_values = RightShift<8>(left_values);
+ const uint8x8_t right_values = RightShiftVector<8>(left_values);
const uint8x8_t value = WeightedBlend(left_values, right_values, shift);
// If |upsampled| is true then extract every other value for output.
@@ -910,12 +910,590 @@
} // namespace
} // namespace low_bitdepth
-void IntraPredDirectionalInit_NEON() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Blend two values based on weights that sum to 32.
+inline uint16x4_t WeightedBlend(const uint16x4_t a, const uint16x4_t b,
+ const int a_weight, const int b_weight) {
+ const uint16x4_t a_product = vmul_n_u16(a, a_weight);
+ const uint16x4_t sum = vmla_n_u16(a_product, b, b_weight);
+
+ return vrshr_n_u16(sum, 5 /*log2(32)*/);
+}
+
+// Blend two values based on weights that sum to 32.
+inline uint16x8_t WeightedBlend(const uint16x8_t a, const uint16x8_t b,
+ const uint16_t a_weight,
+ const uint16_t b_weight) {
+ const uint16x8_t a_product = vmulq_n_u16(a, a_weight);
+ const uint16x8_t sum = vmlaq_n_u16(a_product, b, b_weight);
+
+ return vrshrq_n_u16(sum, 5 /*log2(32)*/);
+}
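+
+// A scalar sketch of the blends above (illustrative only; the weights sum
+// to 32):
+//   uint16_t WeightedBlendScalar(uint16_t a, uint16_t b, int wa, int wb) {
+//     return (a * wa + b * wb + 16) >> 5;  // Rounding shift by log2(32).
+//   }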
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x4x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2_u16(source);
+ } else {
+ dest->val[0] = vld1_u16(source);
+ dest->val[1] = vld1_u16(source + 1);
+ }
+}
+
+// Each element of |dest| contains values associated with one weight value.
+inline void LoadEdgeVals(uint16x8x2_t* dest, const uint16_t* const source,
+ const bool upsampled) {
+ if (upsampled) {
+ *dest = vld2q_u16(source);
+ } else {
+ dest->val[0] = vld1q_u16(source);
+ dest->val[1] = vld1q_u16(source + 1);
+ }
+}
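+
+// With |source| = {s0, s1, s2, ...}, the upsampled vld2 load de-interleaves
+// into val[0] = {s0, s2, s4, ...} and val[1] = {s1, s3, s5, ...}; otherwise
+// the two overlapping loads give val[0] = {s0, s1, ...} and val[1] =
+// {s1, s2, ...}. Either way, val[0] and val[1] hold the left and right
+// neighbors for each output pixel.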
+
+template <bool upsampled>
+inline void DirectionalZone1_4xH(uint16_t* dst, const ptrdiff_t stride,
+ const int height, const uint16_t* const top,
+ const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_x = (4 + height - 1) << upsample_shift;
+ const int16x4_t max_base = vdup_n_s16(max_base_x);
+ const uint16x4_t final_top_val = vdup_n_u16(top[max_base_x]);
+ const int16x4_t index_offset = {0, 1, 2, 3};
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ const int16x4_t base_x = vadd_s16(vdup_n_s16(top_base_x), index_offset);
+ const uint16x4_t max_base_mask = vclt_s16(base_x, max_base);
+
+ uint16x4x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x4_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+    // Mask out lanes that would read past |max_base_x| and use the final top
+    // value for them instead.
+ const uint16x4_t masked_result =
+ vbsl_u16(max_base_mask, combined, final_top_val);
+
+ vst1_u16(dst, masked_result);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], 4 /* width */);
+ dst += stride;
+ }
+}
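+
+// A scalar sketch of one output row above, assuming no upsampling
+// (illustrative only):
+//   const int base = top_x >> 6;
+//   const int shift = (top_x & 0x3F) >> 1;  // In 1/32 units.
+//   for (int x = 0; x < 4; ++x) {
+//     dst[x] = (base + x < max_base_x)
+//                  ? ((top[base + x] * (32 - shift) +
+//                      top[base + x + 1] * shift + 16) >> 5)
+//                  : top[max_base_x];
+//   }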
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
+template <bool upsampled>
+inline void DirectionalZone1_WxH(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ do {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+
+ base_x = vaddq_s16(base_x, block_step);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+  for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+// Process a multiple of 8 |width| by any |height|. Processes horizontally
+// before vertically in the hopes of being a little more cache friendly.
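+// Rows are handled in three phases: rows guaranteed not to read past
+// |max_base_index| (no masking), rows needing per-vector masking plus a
+// Memset tail, and rows that are entirely the corner value.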
+inline void DirectionalZone1_Large(uint16_t* dst, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const top, const int xstep,
+ const bool upsampled) {
+ assert(width % 8 == 0);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ const int max_base_index = (width + height - 1) << upsample_shift;
+ const int16x8_t max_base_x = vdupq_n_s16(max_base_index);
+ const uint16x8_t final_top_val = vdupq_n_u16(top[max_base_index]);
+ const int16x8_t index_offset = {0, 1, 2, 3, 4, 5, 6, 7};
+
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+ const int16x8_t block_step = vdupq_n_s16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset.
+ // |max_base_x| is always greater than |height|, so clipping the denominator
+ // to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_index / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ ((max_base_index - (base_step * width)) << index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ int x = 0;
+ do {
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ vst1q_u16(dst + x, combined);
+
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ // To accommodate reuse of this function in Zone2, permit negative values
+ // for |xstep|.
+ const uint16_t shift_0 = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const uint16_t shift_1 = 32 - shift_0;
+
+ // Use signed values to compare |top_base_x| to |max_base_x|.
+ int16x8_t base_x = vaddq_s16(vdupq_n_s16(top_base_x), index_offset);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_index - top_base_x) >> upsample_shift) + 7) &
+ ~7;
+ for (; x < min_corner_only_x; x += 8, top_base_x += base_step8,
+ base_x = vaddq_s16(base_x, block_step)) {
+ const uint16x8_t max_base_mask = vcltq_s16(base_x, max_base_x);
+
+ uint16x8x2_t sampled_top_row;
+ LoadEdgeVals(&sampled_top_row, top + top_base_x, upsampled);
+ const uint16x8_t combined = WeightedBlend(
+ sampled_top_row.val[0], sampled_top_row.val[1], shift_1, shift_0);
+
+ const uint16x8_t masked_result =
+ vbslq_u16(max_base_mask, combined, final_top_val);
+ vst1q_u16(dst + x, masked_result);
+ }
+ // Corner-only section of the row.
+ Memset(dst + x, top[max_base_index], width - x);
+ }
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_index], width);
+ dst += stride;
+ }
+}
+
+void DirectionalIntraPredictorZone1_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const uint16_t* const top = static_cast<const uint16_t*>(top_row);
+ uint16_t* dst = static_cast<uint16_t*>(dest);
+ stride /= sizeof(top[0]);
+
+ assert(xstep > 0);
+
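+  // |xstep| == 64 corresponds to a 45 degree angle; each successive row reads
+  // |top| one pixel further along, so plain row copies suffice.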
+ if (xstep == 64) {
+ assert(!upsampled_top);
+ const uint16_t* top_ptr = top + 1;
+ const int width_bytes = width * sizeof(top[0]);
+ int y = height;
+ do {
+ memcpy(dst, top_ptr, width_bytes);
+ memcpy(dst + stride, top_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, top_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, top_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ top_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ } else {
+ if (width == 4) {
+ if (upsampled_top) {
+ DirectionalZone1_4xH<true>(dst, stride, height, top, xstep);
+ } else {
+ DirectionalZone1_4xH<false>(dst, stride, height, top, xstep);
+ }
+ } else if (width >= 32) {
+ if (upsampled_top) {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, true);
+ } else {
+ DirectionalZone1_Large(dst, stride, width, height, top, xstep, false);
+ }
+ } else if (upsampled_top) {
+ DirectionalZone1_WxH<true>(dst, stride, width, height, top, xstep);
+ } else {
+ DirectionalZone1_WxH<false>(dst, stride, width, height, top, xstep);
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Zone 3
+// This can be considered "the transpose of Zone 1." In Zone 1, the fractional
+// step applies when moving vertically in the destination block, connected to
+// the change in |y|, whereas in this mode, the step applies when moving
+// horizontally, connected to the change in |x|. This makes vectorization very
+// complicated in row-order, because a given vector may need source pixels that
+// span 16 or 32 pixels in steep angles, requiring multiple expensive table
+// lookups and checked loads. Rather than work in row order, it is simpler to
+// compute |dest| in column order, and then store the transposed results.
+
+// Compute 4x4 sub-blocks.
+// Example of computed sub-blocks of a 4x8 block before and after transpose:
+// 00 10 20 30 00 01 02 03
+// 01 11 21 31 10 11 12 13
+// 02 12 22 32 20 21 22 23
+// 03 13 23 33 30 31 32 33
+// ----------- --> -----------
+// 40 50 60 70 40 41 42 43
+// 41 51 61 71 50 51 52 53
+// 42 52 62 72 60 61 62 63
+// 43 53 63 73 70 71 72 73
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dst, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x4_t result[4];
+
+ int left_y = base_left_y + ystep;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ uint16x4x2_t sampled_left_col;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose4x4(result);
+ Store4(dst, result[0]);
+ dst += stride;
+ Store4(dst, result[1]);
+ dst += stride;
+ Store4(dst, result[2]);
+ dst += stride;
+ Store4(dst, result[3]);
+}
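+
+// A scalar sketch of one column above, assuming no upsampling (illustrative
+// only):
+//   left_y += ystep;
+//   const int base = left_y >> 6;
+//   const int shift = (left_y & 0x3F) >> 1;  // In 1/32 units.
+//   for (int i = 0; i < 4; ++i) {
+//     col[i] = (left[base + i] * (32 - shift) +
+//               left[base + i + 1] * shift + 16) >> 5;
+//   }
+// The four columns are then transposed and stored as rows.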
+
+template <bool upsampled>
+inline void DirectionalZone3_4xH(uint8_t* dest, const ptrdiff_t stride,
+ const int height, const uint16_t* const left,
+ const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<upsampled>(dest, stride, left + (y << upsample_shift),
+ ystep);
+ dest += 4 * stride;
+ y += 4;
+ } while (y < height);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_Wx4(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const uint16_t* const left,
+ const int ystep) {
+ int x = 0;
+ int base_left_y = 0;
+ do {
+ // TODO(petersonab): Establish 8x4 transpose to reserve this function for
+ // 8x4 and 16x4.
+ DirectionalZone3_4x4<upsampled>(dest + 2 * x, stride, left, ystep,
+ base_left_y);
+ base_left_y += 4 * ystep;
+ x += 4;
+ } while (x < width);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_8x8(uint8_t* dest, const ptrdiff_t stride,
+ const uint16_t* const left, const int ystep,
+ const int base_left_y = 0) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+
+ // Compute one column at a time, then transpose for storage.
+ uint16x8_t result[8];
+
+ int left_y = base_left_y + ystep;
+ uint16x8x2_t sampled_left_col;
+ int left_offset = left_y >> index_scale_bits;
+ int shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ int shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[0] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[1] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[2] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[3] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[4] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[5] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[6] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ left_y += ystep;
+ left_offset = left_y >> index_scale_bits;
+ shift_0 = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ shift_1 = 32 - shift_0;
+ LoadEdgeVals(&sampled_left_col, &left[left_offset], upsampled);
+ result[7] = WeightedBlend(sampled_left_col.val[0], sampled_left_col.val[1],
+ shift_1, shift_0);
+
+ Transpose8x8(result);
+ Store8(dest, result[0]);
+ dest += stride;
+ Store8(dest, result[1]);
+ dest += stride;
+ Store8(dest, result[2]);
+ dest += stride;
+ Store8(dest, result[3]);
+ dest += stride;
+ Store8(dest, result[4]);
+ dest += stride;
+ Store8(dest, result[5]);
+ dest += stride;
+ Store8(dest, result[6]);
+ dest += stride;
+ Store8(dest, result[7]);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_WxH(uint8_t* dest, const ptrdiff_t stride,
+ const int width, const int height,
+ const uint16_t* const left, const int ystep) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> (6 - upsample_shift)) +
+ (/* base_step */ 1 << upsample_shift) *
+ (height - 1)); // left_base_y
+ int y = 0;
+ do {
+ int x = 0;
+ uint8_t* dst_x = dest + y * stride;
+ do {
+ const int base_left_y = ystep * x;
+ DirectionalZone3_8x8<upsampled>(
+ dst_x, stride, left + (y << upsample_shift), ystep, base_left_y);
+      dst_x += 8 * sizeof(uint16_t);  // Advance 8 pixels; |dst_x| is a byte
+                                      // pointer.
+ x += 8;
+ } while (x < width);
+ y += 8;
+ } while (y < height);
+}
+
+void DirectionalIntraPredictorZone3_NEON(void* const dest,
+ const ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const uint16_t* const left = static_cast<const uint16_t*>(left_column);
+ uint8_t* dst = static_cast<uint8_t*>(dest);
+
+ if (ystep == 64) {
+ assert(!upsampled_left);
+ const int width_bytes = width * sizeof(left[0]);
+ int y = height;
+    const uint16_t* left_ptr = left + 1;
+    do {
+ memcpy(dst, left_ptr, width_bytes);
+ memcpy(dst + stride, left_ptr + 1, width_bytes);
+ memcpy(dst + 2 * stride, left_ptr + 2, width_bytes);
+ memcpy(dst + 3 * stride, left_ptr + 3, width_bytes);
+ dst += 4 * stride;
+ left_ptr += 4;
+ y -= 4;
+ } while (y != 0);
+ return;
+ }
+ if (width == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_4xH<true>(dst, stride, height, left, ystep);
+ } else {
+ DirectionalZone3_4xH<false>(dst, stride, height, left, ystep);
+ }
+ } else if (height == 4) {
+ if (upsampled_left) {
+ DirectionalZone3_Wx4<true>(dst, stride, width, left, ystep);
+ } else {
+ DirectionalZone3_Wx4<false>(dst, stride, width, left, ystep);
+ }
+ } else {
+ if (upsampled_left) {
+ // |upsampled_left| can only be true if |width| + |height| <= 16,
+ // therefore this is 8x8.
+ DirectionalZone3_8x8<true>(dst, stride, left, ystep);
+ } else {
+ DirectionalZone3_WxH<false>(dst, stride, width, height, left, ystep);
+ }
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->directional_intra_predictor_zone1 = DirectionalIntraPredictorZone1_NEON;
+ dsp->directional_intra_predictor_zone3 = DirectionalIntraPredictorZone3_NEON;
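+  // Zone 2 has no 10bpp NEON implementation yet and keeps the C fallback.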
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intrapred_directional_neon.h b/libgav1/src/dsp/arm/intrapred_directional_neon.h
new file mode 100644
index 0000000..f7d6235
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_directional_neon.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*; see the defines below
+// for specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
+#endif
+
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_DIRECTIONAL_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc b/libgav1/src/dsp/arm/intrapred_filter_neon.cc
similarity index 96%
rename from libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc
rename to libgav1/src/dsp/arm/intrapred_filter_neon.cc
index 411708e..bd9f61d 100644
--- a/libgav1/src/dsp/arm/intrapred_filter_intra_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_filter_neon.cc
@@ -1,4 +1,4 @@
-// Copyright 2019 The libgav1 Authors
+// Copyright 2021 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_filter.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -160,16 +160,16 @@
} // namespace
} // namespace low_bitdepth
-void IntraPredFilterIntraInit_NEON() { low_bitdepth::Init8bpp(); }
+void IntraPredFilterInit_NEON() { low_bitdepth::Init8bpp(); }
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
-void IntraPredFilterIntraInit_NEON() {}
+void IntraPredFilterInit_NEON() {}
} // namespace dsp
} // namespace libgav1
diff --git a/libgav1/src/dsp/arm/intrapred_filter_neon.h b/libgav1/src/dsp/arm/intrapred_filter_neon.h
new file mode 100644
index 0000000..283c1b1
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_filter_neon.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor; see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredFilterInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_FILTER_NEON_H_
diff --git a/libgav1/src/dsp/arm/intrapred_neon.cc b/libgav1/src/dsp/arm/intrapred_neon.cc
index c967d82..c143648 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_neon.cc
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -964,6 +965,200 @@
using _64x64 = DcPredFuncs_NEON<6, 6, DcSum_NEON, DcStore_NEON<64, 64>>;
};
+// IntraPredFuncs_NEON::Horizontal -- broadcast each |left| pixel across its
+// row.
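+// |stride| is in bytes, so rows are stepped with a uint8_t pointer and each
+// row is reinterpreted as uint16_t for the stores.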
+
+template <int block_height>
+void Horizontal4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x4_t row = vld1_dup_u16(left + y);
+ vst1_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ const uint16x8_t row = vld1q_dup_u16(left + y);
+ vst1q_u16(dst16, row);
+ dst += stride;
+ } while (++y < block_height);
+}
+
+template <int block_height>
+void Horizontal16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+template <int block_height>
+void Horizontal32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* /*top_row*/,
+ const void* const left_column) {
+ const auto* const left = static_cast<const uint16_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = 0;
+ do {
+ const uint16x8_t row0 = vld1q_dup_u16(left + y);
+ const uint16x8_t row1 = vld1q_dup_u16(left + y + 1);
+ auto* dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row0);
+ vst1q_u16(dst16 + 8, row0);
+ vst1q_u16(dst16 + 16, row0);
+ vst1q_u16(dst16 + 24, row0);
+ dst += stride;
+ dst16 = reinterpret_cast<uint16_t*>(dst);
+ vst1q_u16(dst16, row1);
+ vst1q_u16(dst16 + 8, row1);
+ vst1q_u16(dst16 + 16, row1);
+ vst1q_u16(dst16 + 24, row1);
+ dst += stride;
+ y += 2;
+ } while (y < block_height);
+}
+
+// IntraPredFuncs_NEON::Vertical -- copy top row to all rows
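+// The copy is bit-exact, so pixels can be handled as plain bytes regardless
+// of bitdepth: a 4-pixel 10bpp row is one uint8x8_t and an 8-pixel row is
+// one uint8x16_t.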
+
+template <int block_height>
+void Vertical4xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x8_t row = vld1_u8(top);
+ int y = block_height;
+ do {
+ vst1_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical8xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row = vld1q_u8(top);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row);
+ dst += stride;
+ } while (--y != 0);
+}
+
+template <int block_height>
+void Vertical16xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical32xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int block_height>
+void Vertical64xH_NEON(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const /*left_column*/) {
+ const auto* const top = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const uint8x16_t row0 = vld1q_u8(top);
+ const uint8x16_t row1 = vld1q_u8(top + 16);
+ const uint8x16_t row2 = vld1q_u8(top + 32);
+ const uint8x16_t row3 = vld1q_u8(top + 48);
+ const uint8x16_t row4 = vld1q_u8(top + 64);
+ const uint8x16_t row5 = vld1q_u8(top + 80);
+ const uint8x16_t row6 = vld1q_u8(top + 96);
+ const uint8x16_t row7 = vld1q_u8(top + 112);
+ int y = block_height;
+ do {
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ vst1q_u8(dst, row0);
+ vst1q_u8(dst + 16, row1);
+ vst1q_u8(dst + 32, row2);
+ vst1q_u8(dst + 48, row3);
+ vst1q_u8(dst + 64, row4);
+ vst1q_u8(dst + 80, row5);
+ vst1q_u8(dst + 96, row6);
+ vst1q_u8(dst + 112, row7);
+ dst += stride;
+ y -= 2;
+ } while (y != 0);
+}
+
void Init10bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
assert(dsp != nullptr);
@@ -973,6 +1168,8 @@
DcDefs::_4x4::DcLeft;
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDc] =
DcDefs::_4x4::Dc;
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorVertical] =
+ Vertical4xH_NEON<4>;
// 4x8
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcTop] =
@@ -981,6 +1178,10 @@
DcDefs::_4x8::DcLeft;
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDc] =
DcDefs::_4x8::Dc;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorVertical] =
+ Vertical4xH_NEON<8>;
// 4x16
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcTop] =
@@ -989,6 +1190,10 @@
DcDefs::_4x16::DcLeft;
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDc] =
DcDefs::_4x16::Dc;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorHorizontal] =
+ Horizontal4xH_NEON<16>;
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorVertical] =
+ Vertical4xH_NEON<16>;
// 8x4
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcTop] =
@@ -997,6 +1202,8 @@
DcDefs::_8x4::DcLeft;
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDc] =
DcDefs::_8x4::Dc;
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorVertical] =
+ Vertical8xH_NEON<4>;
// 8x8
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcTop] =
@@ -1005,6 +1212,10 @@
DcDefs::_8x8::DcLeft;
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDc] =
DcDefs::_8x8::Dc;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorVertical] =
+ Vertical8xH_NEON<8>;
// 8x16
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcTop] =
@@ -1013,6 +1224,8 @@
DcDefs::_8x16::DcLeft;
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDc] =
DcDefs::_8x16::Dc;
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorVertical] =
+ Vertical8xH_NEON<16>;
// 8x32
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcTop] =
@@ -1021,6 +1234,10 @@
DcDefs::_8x32::DcLeft;
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDc] =
DcDefs::_8x32::Dc;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorHorizontal] =
+ Horizontal8xH_NEON<32>;
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorVertical] =
+ Vertical8xH_NEON<32>;
// 16x4
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcTop] =
@@ -1029,6 +1246,8 @@
DcDefs::_16x4::DcLeft;
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDc] =
DcDefs::_16x4::Dc;
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorVertical] =
+ Vertical16xH_NEON<4>;
// 16x8
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcTop] =
@@ -1037,6 +1256,10 @@
DcDefs::_16x8::DcLeft;
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDc] =
DcDefs::_16x8::Dc;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorHorizontal] =
+ Horizontal16xH_NEON<8>;
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorVertical] =
+ Vertical16xH_NEON<8>;
// 16x16
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcTop] =
@@ -1045,6 +1268,8 @@
DcDefs::_16x16::DcLeft;
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDc] =
DcDefs::_16x16::Dc;
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorVertical] =
+ Vertical16xH_NEON<16>;
// 16x32
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcTop] =
@@ -1053,6 +1278,8 @@
DcDefs::_16x32::DcLeft;
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDc] =
DcDefs::_16x32::Dc;
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorVertical] =
+ Vertical16xH_NEON<32>;
// 16x64
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcTop] =
@@ -1061,6 +1288,8 @@
DcDefs::_16x64::DcLeft;
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDc] =
DcDefs::_16x64::Dc;
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorVertical] =
+ Vertical16xH_NEON<64>;
// 32x8
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcTop] =
@@ -1069,6 +1298,8 @@
DcDefs::_32x8::DcLeft;
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDc] =
DcDefs::_32x8::Dc;
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorVertical] =
+ Vertical32xH_NEON<8>;
// 32x16
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcTop] =
@@ -1077,6 +1308,8 @@
DcDefs::_32x16::DcLeft;
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDc] =
DcDefs::_32x16::Dc;
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorVertical] =
+ Vertical32xH_NEON<16>;
// 32x32
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcTop] =
@@ -1085,6 +1318,8 @@
DcDefs::_32x32::DcLeft;
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDc] =
DcDefs::_32x32::Dc;
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorVertical] =
+ Vertical32xH_NEON<32>;
// 32x64
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcTop] =
@@ -1093,6 +1328,10 @@
DcDefs::_32x64::DcLeft;
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDc] =
DcDefs::_32x64::Dc;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorHorizontal] =
+ Horizontal32xH_NEON<64>;
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorVertical] =
+ Vertical32xH_NEON<64>;
// 64x16
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcTop] =
@@ -1101,6 +1340,8 @@
DcDefs::_64x16::DcLeft;
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDc] =
DcDefs::_64x16::Dc;
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorVertical] =
+ Vertical64xH_NEON<16>;
// 64x32
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcTop] =
@@ -1109,6 +1350,8 @@
DcDefs::_64x32::DcLeft;
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDc] =
DcDefs::_64x32::Dc;
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorVertical] =
+ Vertical64xH_NEON<32>;
// 64x64
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcTop] =
@@ -1117,6 +1360,8 @@
DcDefs::_64x64::DcLeft;
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDc] =
DcDefs::_64x64::Dc;
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorVertical] =
+ Vertical64xH_NEON<64>;
}
} // namespace
@@ -1133,7 +1378,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intrapred_neon.h b/libgav1/src/dsp/arm/intrapred_neon.h
index 16f858c..b27f29f 100644
--- a/libgav1/src/dsp/arm/intrapred_neon.h
+++ b/libgav1/src/dsp/arm/intrapred_neon.h
@@ -23,396 +23,282 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
-void IntraPredCflInit_NEON();
-void IntraPredDirectionalInit_NEON();
-void IntraPredFilterIntraInit_NEON();
+// Initializes Dsp::intra_predictors.
+// See the defines below for specifics. These functions are not thread-safe.
void IntraPredInit_NEON();
-void IntraPredSmoothInit_NEON();
} // namespace dsp
} // namespace libgav1
#if LIBGAV1_ENABLE_NEON
-// 8 bit
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_NEON
-
// 4x4
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
-
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorPaeth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_NEON
// 10 bit
// 4x4
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x8
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 4x16
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x4
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x8
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x16
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 8x32
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x4
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x8
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x16
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x32
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 16x64
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x8
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcLeft LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x16
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x32
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 32x64
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorHorizontal \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x16
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x32
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
// 64x64
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcTop LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcLeft \
LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDc LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorVertical \
+ LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_NEON_H_
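With this change the NEON intra-prediction initializers are split per predictor family, so each header declares its own entry point. A minimal sketch of how the split initializers might be invoked together; the wrapper name InitAllIntraPredictorsForNeon is hypothetical, and the real call sites live in the generic dsp setup code:

#include "src/dsp/arm/intrapred_neon.h"
#include "src/dsp/arm/intrapred_smooth_neon.h"

namespace libgav1 {
namespace dsp {

// Hypothetical wrapper, shown only for orientation.
inline void InitAllIntraPredictorsForNeon() {
#if LIBGAV1_ENABLE_NEON
  IntraPredInit_NEON();        // Dc/Paeth/Vertical/Horizontal entries.
  IntraPredSmoothInit_NEON();  // Smooth/SmoothVertical/SmoothHorizontal.
#endif
}

}  // namespace dsp
}  // namespace libgav1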
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
index abc93e8..c33f333 100644
--- a/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
+++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
#if LIBGAV1_ENABLE_NEON
@@ -26,6 +26,7 @@
#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -605,7 +606,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/intrapred_smooth_neon.h b/libgav1/src/dsp/arm/intrapred_smooth_neon.h
new file mode 100644
index 0000000..edd01be
--- /dev/null
+++ b/libgav1/src/dsp/arm/intrapred_smooth_neon.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+#define LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_NEON();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_ENABLE_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_NEON
+#endif // LIBGAV1_ENABLE_NEON
+
+#endif // LIBGAV1_SRC_DSP_ARM_INTRAPRED_SMOOTH_NEON_H_
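The per-size LIBGAV1_Dsp8bpp_* defines advertise which table entries have a NEON implementation so that, under static dispatch, the portable initializer can skip registering its C fallback for those entries. A hedged sketch of that convention; every name below is illustrative, not the library's API:

#include <cstddef>

namespace {

// Sketch of the static-dispatch convention: a LIBGAV1_Dsp8bpp_* define marks
// an entry as having a SIMD version, so the C fallback is only registered
// when the define is absent.
using PredictorFn = void (*)(void* dst, std::ptrdiff_t stride,
                             const void* top, const void* left);

void Smooth4x4_C(void* /*dst*/, std::ptrdiff_t /*stride*/, const void* /*top*/,
                 const void* /*left*/) {}

PredictorFn smooth_4x4_entry = nullptr;

void InitSmooth4x4Fallback() {
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
  smooth_4x4_entry = Smooth4x4_C;  // No NEON version advertised: use C.
#endif
}

}  // namespace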
diff --git a/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
new file mode 100644
index 0000000..ff184a1
--- /dev/null
+++ b/libgav1/src/dsp/arm/inverse_transform_10bit_neon.cc
@@ -0,0 +1,2543 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/inverse_transform.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
+
+#include <arm_neon.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "src/dsp/arm/common_neon.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/array_2d.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+// Include the constants and utility functions inside the anonymous namespace.
+#include "src/dsp/inverse_transform.inc"
+
+//------------------------------------------------------------------------------
+
+LIBGAV1_ALWAYS_INLINE void Transpose4x4(const int32x4_t in[4],
+ int32x4_t out[4]) {
+ // in:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // 00 10 02 12 a.val[0]
+ // 01 11 03 13 a.val[1]
+ // 20 30 22 32 b.val[0]
+ // 21 31 23 33 b.val[1]
+ const int32x4x2_t a = vtrnq_s32(in[0], in[1]);
+ const int32x4x2_t b = vtrnq_s32(in[2], in[3]);
+ out[0] = vextq_s32(vextq_s32(a.val[0], a.val[0], 2), b.val[0], 2);
+ out[1] = vextq_s32(vextq_s32(a.val[1], a.val[1], 2), b.val[1], 2);
+ out[2] = vextq_s32(a.val[0], vextq_s32(b.val[0], b.val[0], 2), 2);
+ out[3] = vextq_s32(a.val[1], vextq_s32(b.val[1], b.val[1], 2), 2);
+ // out:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+}
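The vtrnq_s32/vextq_s32 sequence above is simply a 4x4 matrix transpose done in registers. A scalar reference, useful for checking the lane diagram in the comments (an assumed helper, not part of the change):

#include <cstdint>

// Scalar reference for the transpose above: out[i][j] = in[j][i] over a
// 4x4 tile of int32 lanes.
inline void Transpose4x4_C(const int32_t in[4][4], int32_t out[4][4]) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      out[i][j] = in[j][i];  // Row i of the output is column i of the input.
    }
  }
}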
+
+//------------------------------------------------------------------------------
+template <int store_count>
+LIBGAV1_ALWAYS_INLINE void StoreDst(int32_t* dst, int32_t stride, int32_t idx,
+ const int32x4_t* const s) {
+ assert(store_count % 4 == 0);
+ for (int i = 0; i < store_count; i += 4) {
+ vst1q_s32(&dst[i * stride + idx], s[i]);
+ vst1q_s32(&dst[(i + 1) * stride + idx], s[i + 1]);
+ vst1q_s32(&dst[(i + 2) * stride + idx], s[i + 2]);
+ vst1q_s32(&dst[(i + 3) * stride + idx], s[i + 3]);
+ }
+}
+
+template <int load_count>
+LIBGAV1_ALWAYS_INLINE void LoadSrc(const int32_t* src, int32_t stride,
+ int32_t idx, int32x4_t* x) {
+ assert(load_count % 4 == 0);
+ for (int i = 0; i < load_count; i += 4) {
+ x[i] = vld1q_s32(&src[i * stride + idx]);
+ x[i + 1] = vld1q_s32(&src[(i + 1) * stride + idx]);
+ x[i + 2] = vld1q_s32(&src[(i + 2) * stride + idx]);
+ x[i + 3] = vld1q_s32(&src[(i + 3) * stride + idx]);
+ }
+}
+
+// Butterfly rotate 4 values.
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_4(int32x4_t* a, int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t acc_x = vmulq_n_s32(*a, cos128);
+ const int32x4_t acc_y = vmulq_n_s32(*a, sin128);
+  // The max range for the input is 18 bits. The cos128/sin128 values are 13
+  // bits, which leaves 1 bit for the add/subtract. For 10bpp, x/y will fit
+  // in a 32-bit lane.
+ const int32x4_t x0 = vmlsq_n_s32(acc_x, *b, sin128);
+ const int32x4_t y0 = vmlaq_n_s32(acc_y, *b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
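ButterflyRotation_4 applies a plane rotation in 12-bit fixed point: x = a*cos - b*sin and y = a*sin + b*cos, each rounded back by 2^12. A scalar model of the same arithmetic, assuming Cos128()/Sin128() return the 12-bit fixed-point values used above:

#include <cstdint>

// Scalar model of ButterflyRotation_4.
inline void ButterflyRotation_C(int32_t* a, int32_t* b, int32_t cos128,
                                int32_t sin128, bool flip) {
  const int64_t x = static_cast<int64_t>(*a) * cos128 -
                    static_cast<int64_t>(*b) * sin128;
  const int64_t y = static_cast<int64_t>(*a) * sin128 +
                    static_cast<int64_t>(*b) * cos128;
  // vrshrq_n_s32(v, 12) is a rounding shift: (v + (1 << 11)) >> 12.
  const int32_t xr = static_cast<int32_t>((x + (1 << 11)) >> 12);
  const int32_t yr = static_cast<int32_t>((y + (1 << 11)) >> 12);
  *a = flip ? yr : xr;
  *b = flip ? xr : yr;
}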
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_FirstIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ assert(sin128 <= 0xfff);
+ const int32x4_t x0 = vmulq_n_s32(*b, -sin128);
+ const int32x4_t y0 = vmulq_n_s32(*b, cos128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void ButterflyRotation_SecondIsZero(int32x4_t* a,
+ int32x4_t* b,
+ const int angle,
+ const bool flip) {
+ const int32_t cos128 = Cos128(angle);
+ const int32_t sin128 = Sin128(angle);
+ const int32x4_t x0 = vmulq_n_s32(*a, cos128);
+ const int32x4_t y0 = vmulq_n_s32(*a, sin128);
+ const int32x4_t x = vrshrq_n_s32(x0, 12);
+ const int32x4_t y = vrshrq_n_s32(y0, 12);
+ if (flip) {
+ *a = y;
+ *b = x;
+ } else {
+ *a = x;
+ *b = y;
+ }
+}
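The _FirstIsZero/_SecondIsZero variants are the same rotation specialized for a known-zero operand, which removes half of the multiplies:

\[
\begin{pmatrix} x \\ y \end{pmatrix}
= 2^{-12}
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \phantom{-}\cos\theta \end{pmatrix}
\begin{pmatrix} a \\ b \end{pmatrix},
\qquad
a = 0 \Rightarrow (x, y) = 2^{-12}\, b\, (-\sin\theta,\ \cos\theta),
\qquad
b = 0 \Rightarrow (x, y) = 2^{-12}\, a\, (\cos\theta,\ \sin\theta).
\]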
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = x;
+ *b = y;
+}
+
+LIBGAV1_ALWAYS_INLINE void HadamardRotation(int32x4_t* a, int32x4_t* b,
+ bool flip, const int32x4_t* min,
+ const int32x4_t* max) {
+ int32x4_t x, y;
+ if (flip) {
+ y = vqaddq_s32(*b, *a);
+ x = vqsubq_s32(*b, *a);
+ } else {
+ x = vqaddq_s32(*a, *b);
+ y = vqsubq_s32(*a, *b);
+ }
+ *a = vmaxq_s32(vminq_s32(x, *max), *min);
+ *b = vmaxq_s32(vminq_s32(y, *max), *min);
+}
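The clamped HadamardRotation overload is a saturating add/subtract butterfly followed by a clamp to the working range. A scalar model, under the assumption (true here) that [min, max] lies strictly inside the int32 range, so a 64-bit intermediate plus one clamp matches saturating add/subtract followed by vmin/vmax:

#include <algorithm>
#include <cstdint>

// Scalar model of the clamped HadamardRotation overload.
inline void HadamardRotation_C(int32_t* a, int32_t* b, bool flip, int32_t min,
                               int32_t max) {
  const int64_t sum = static_cast<int64_t>(*a) + *b;
  const int64_t diff =
      flip ? static_cast<int64_t>(*b) - *a : static_cast<int64_t>(*a) - *b;
  const int64_t out_a = flip ? diff : sum;  // Flip swaps the two outputs.
  const int64_t out_b = flip ? sum : diff;
  *a = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(out_a, min), max));
  *b = static_cast<int32_t>(
      std::min<int64_t>(std::max<int64_t>(out_b, min), max));
}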
+
+using ButterflyRotationFunc = void (*)(int32x4_t* a, int32x4_t* b, int angle,
+ bool flip);
+
+//------------------------------------------------------------------------------
+// Discrete Cosine Transforms (DCT).
+
+template <int width>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t s0 = vbslq_s32(v_mask, v_src_round, v_src);
+ const int32_t cos128 = Cos128(32);
+ const int32x4_t xy = vqrdmulhq_n_s32(s0, cos128 << (31 - 12));
+  // vqrshlq_s32 will shift right if the shift value is negative.
+ const int32x4_t xy_shifted = vqrshlq_s32(xy, vdupq_n_s32(-row_shift));
+ // Clamp result to signed 16 bits.
+ const int32x4_t result = vmovl_s16(vqmovn_s32(xy_shifted));
+ if (width == 4) {
+ vst1q_s32(dst, result);
+ } else {
+ for (int i = 0; i < width; i += 4) {
+ vst1q_s32(dst, result);
+ dst += 4;
+ }
+ }
+ return true;
+}
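DctDcOnly handles rows whose only nonzero coefficient is the DC term: it optionally applies the sqrt(2) row scaling, multiplies by Cos128(32), shifts, and saturates to the int16 range before splatting across the row. A scalar model of the per-value arithmetic; the 5793 constant is assumed to match kTransformRowMultiplier (approximately sqrt(2) * 2^12) from the shared inverse_transform.inc, and cos128_32 stands for Cos128(32):

#include <cstdint>

inline int32_t RoundShift(int64_t v, int bits) {
  return static_cast<int32_t>((v + (int64_t{1} << (bits - 1))) >> bits);
}

// Scalar model of DctDcOnly's per-value arithmetic.
inline int16_t DctDcOnlyValue_C(int32_t dc, bool should_round, int row_shift,
                                int32_t cos128_32) {
  if (should_round) dc = RoundShift(static_cast<int64_t>(dc) * 5793, 12);
  int32_t xy = RoundShift(static_cast<int64_t>(dc) * cos128_32, 12);
  if (row_shift > 0) xy = RoundShift(xy, row_shift);
  // Mirrors vqmovn_s32 + vmovl_s16: saturate to the signed 16-bit range.
  if (xy > INT16_MAX) xy = INT16_MAX;
  if (xy < INT16_MIN) xy = INT16_MIN;
  return static_cast<int16_t>(xy);
}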
+
+template <int height>
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32_t cos128 = Cos128(32);
+
+  // Calculate the dc values for the first row.
+ if (width == 4) {
+ const int32x4_t v_src = vld1q_s32(dst);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(dst, xy);
+ } else {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+ const int32x4_t xy = vqrdmulhq_n_s32(v_src, cos128 << (31 - 12));
+ vst1q_s32(&dst[i], xy);
+ i += 4;
+ } while (i < width);
+ }
+
+ // Copy first row to the rest of the block.
+ for (int y = 1; y < height; ++y) {
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
+ }
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct4Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 12.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
+ ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
+ } else {
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
+ }
+
+ // stage 17.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[3], false);
+ HadamardRotation(&s[1], &s[2], false);
+ } else {
+ HadamardRotation(&s[0], &s[3], false, min, max);
+ HadamardRotation(&s[1], &s[2], false, min, max);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+  // When |is_row| is true, set |range| to the row range; otherwise, set it
+  // to the column range.
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[4], x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 2, 1, 3
+ s[0] = x[0];
+ s[1] = x[2];
+ s[2] = x[1];
+ s[3] = x[3];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 4; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(s, s);
+ }
+ StoreDst<4>(dst, step, 0, s);
+}
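The |range| constant caps intermediate magnitudes between stages. Reading it off the code above (min = -(1 << range), max = (1 << range) - 1, with kBitdepth10 = 10):

\[
x \in \left[-2^{\mathrm{range}},\ 2^{\mathrm{range}} - 1\right],
\qquad
\mathrm{range} =
\begin{cases}
\mathrm{kBitdepth10} + 7 = 17 & \text{for rows,}\\
15 & \text{for columns.}
\end{cases}
\]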
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct8Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 8.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
+ ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
+ } else {
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
+ }
+
+ // stage 13.
+ HadamardRotation(&s[4], &s[5], false, min, max);
+ HadamardRotation(&s[6], &s[7], true, min, max);
+
+ // stage 18.
+ butterfly_rotation(&s[6], &s[5], 32, true);
+
+ // stage 22.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[7], false);
+ HadamardRotation(&s[1], &s[6], false);
+ HadamardRotation(&s[2], &s[5], false);
+ HadamardRotation(&s[3], &s[4], false);
+ } else {
+ HadamardRotation(&s[0], &s[7], false, min, max);
+ HadamardRotation(&s[1], &s[6], false, min, max);
+ HadamardRotation(&s[2], &s[5], false, min, max);
+ HadamardRotation(&s[3], &s[4], false, min, max);
+ }
+}
+
+// Process dct8 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ // kBitReverseLookup 0, 4, 2, 6, 1, 5, 3, 7,
+ s[0] = x[0];
+ s[1] = x[4];
+ s[2] = x[2];
+ s[3] = x[6];
+ s[4] = x[1];
+ s[5] = x[5];
+ s[6] = x[3];
+ s[7] = x[7];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ Transpose4x4(&s[0], &s[0]);
+ Transpose4x4(&s[4], &s[4]);
+ StoreDst<4>(dst, step, 0, &s[0]);
+ StoreDst<4>(dst, step, 4, &s[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &s[0]);
+ }
+}
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct16Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 5.
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
+ ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
+ ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
+ ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
+ } else {
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
+ }
+
+ // stage 9.
+ HadamardRotation(&s[8], &s[9], false, min, max);
+ HadamardRotation(&s[10], &s[11], true, min, max);
+ HadamardRotation(&s[12], &s[13], false, min, max);
+ HadamardRotation(&s[14], &s[15], true, min, max);
+
+ // stage 14.
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
+
+ // stage 19.
+ HadamardRotation(&s[8], &s[11], false, min, max);
+ HadamardRotation(&s[9], &s[10], false, min, max);
+ HadamardRotation(&s[12], &s[15], true, min, max);
+ HadamardRotation(&s[13], &s[14], true, min, max);
+
+ // stage 23.
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
+
+ // stage 26.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[15], false);
+ HadamardRotation(&s[1], &s[14], false);
+ HadamardRotation(&s[2], &s[13], false);
+ HadamardRotation(&s[3], &s[12], false);
+ HadamardRotation(&s[4], &s[11], false);
+ HadamardRotation(&s[5], &s[10], false);
+ HadamardRotation(&s[6], &s[9], false);
+ HadamardRotation(&s[7], &s[8], false);
+ } else {
+ HadamardRotation(&s[0], &s[15], false, min, max);
+ HadamardRotation(&s[1], &s[14], false, min, max);
+ HadamardRotation(&s[2], &s[13], false, min, max);
+ HadamardRotation(&s[3], &s[12], false, min, max);
+ HadamardRotation(&s[4], &s[11], false, min, max);
+ HadamardRotation(&s[5], &s[10], false, min, max);
+ HadamardRotation(&s[6], &s[9], false, min, max);
+ HadamardRotation(&s[7], &s[8], false, min, max);
+ }
+}
+
+// Process dct16 rows or columns, depending on the |is_row| flag.
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
+ s[0] = x[0];
+ s[1] = x[8];
+ s[2] = x[4];
+ s[3] = x[12];
+ s[4] = x[2];
+ s[5] = x[10];
+ s[6] = x[6];
+ s[7] = x[14];
+ s[8] = x[1];
+ s[9] = x[9];
+ s[10] = x[5];
+ s[11] = x[13];
+ s[12] = x[3];
+ s[13] = x[11];
+ s[14] = x[7];
+ s[15] = x[15];
+
+ Dct4Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<butterfly_rotation>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ s[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(s[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&s[idx], &s[idx]);
+ Transpose4x4(&s[idx + 4], &s[idx + 4]);
+ StoreDst<4>(dst, step, idx, &s[idx]);
+ StoreDst<4>(dst, step, idx + 4, &s[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &s[0]);
+ }
+}
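The stage-1 permutations in Dct8/Dct16/Dct32 (and Dct64) are the kBitReverseLookup tables written out by hand: s[i] takes x[reverse(i)], where reverse() mirrors the low log2(n) bits. A small helper that reproduces the tables, offered as a checking aid rather than part of the change:

#include <cstdint>

// Reverses the low |bits| bits of |i|. With bits = 4 this reproduces the
// kBitReverseLookup table in Dct16_NEON's stage 1.
constexpr uint32_t ReverseBits(uint32_t i, int bits) {
  uint32_t r = 0;
  for (int b = 0; b < bits; ++b) {
    r = (r << 1) | (i & 1);
    i >>= 1;
  }
  return r;
}

static_assert(ReverseBits(1, 4) == 8, "s[1] = x[8] in Dct16_NEON");
static_assert(ReverseBits(5, 4) == 10, "s[5] = x[10] in Dct16_NEON");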
+
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
+LIBGAV1_ALWAYS_INLINE void Dct32Stages(int32x4_t* s, const int32x4_t* min,
+ const int32x4_t* max,
+ const bool is_last_stage) {
+ // stage 3
+ if (is_fast_butterfly) {
+ ButterflyRotation_SecondIsZero(&s[16], &s[31], 62, false);
+ ButterflyRotation_FirstIsZero(&s[17], &s[30], 30, false);
+ ButterflyRotation_SecondIsZero(&s[18], &s[29], 46, false);
+ ButterflyRotation_FirstIsZero(&s[19], &s[28], 14, false);
+ ButterflyRotation_SecondIsZero(&s[20], &s[27], 54, false);
+ ButterflyRotation_FirstIsZero(&s[21], &s[26], 22, false);
+ ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
+ ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
+ } else {
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
+ }
+
+ // stage 6.
+ HadamardRotation(&s[16], &s[17], false, min, max);
+ HadamardRotation(&s[18], &s[19], true, min, max);
+ HadamardRotation(&s[20], &s[21], false, min, max);
+ HadamardRotation(&s[22], &s[23], true, min, max);
+ HadamardRotation(&s[24], &s[25], false, min, max);
+ HadamardRotation(&s[26], &s[27], true, min, max);
+ HadamardRotation(&s[28], &s[29], false, min, max);
+ HadamardRotation(&s[30], &s[31], true, min, max);
+
+ // stage 10.
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
+
+ // stage 15.
+ HadamardRotation(&s[16], &s[19], false, min, max);
+ HadamardRotation(&s[17], &s[18], false, min, max);
+ HadamardRotation(&s[20], &s[23], true, min, max);
+ HadamardRotation(&s[21], &s[22], true, min, max);
+ HadamardRotation(&s[24], &s[27], false, min, max);
+ HadamardRotation(&s[25], &s[26], false, min, max);
+ HadamardRotation(&s[28], &s[31], true, min, max);
+ HadamardRotation(&s[29], &s[30], true, min, max);
+
+ // stage 20.
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
+
+ // stage 24.
+ HadamardRotation(&s[16], &s[23], false, min, max);
+ HadamardRotation(&s[17], &s[22], false, min, max);
+ HadamardRotation(&s[18], &s[21], false, min, max);
+ HadamardRotation(&s[19], &s[20], false, min, max);
+ HadamardRotation(&s[24], &s[31], true, min, max);
+ HadamardRotation(&s[25], &s[30], true, min, max);
+ HadamardRotation(&s[26], &s[29], true, min, max);
+ HadamardRotation(&s[27], &s[28], true, min, max);
+
+ // stage 27.
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
+
+ // stage 29.
+ if (is_last_stage) {
+ HadamardRotation(&s[0], &s[31], false);
+ HadamardRotation(&s[1], &s[30], false);
+ HadamardRotation(&s[2], &s[29], false);
+ HadamardRotation(&s[3], &s[28], false);
+ HadamardRotation(&s[4], &s[27], false);
+ HadamardRotation(&s[5], &s[26], false);
+ HadamardRotation(&s[6], &s[25], false);
+ HadamardRotation(&s[7], &s[24], false);
+ HadamardRotation(&s[8], &s[23], false);
+ HadamardRotation(&s[9], &s[22], false);
+ HadamardRotation(&s[10], &s[21], false);
+ HadamardRotation(&s[11], &s[20], false);
+ HadamardRotation(&s[12], &s[19], false);
+ HadamardRotation(&s[13], &s[18], false);
+ HadamardRotation(&s[14], &s[17], false);
+ HadamardRotation(&s[15], &s[16], false);
+ } else {
+ HadamardRotation(&s[0], &s[31], false, min, max);
+ HadamardRotation(&s[1], &s[30], false, min, max);
+ HadamardRotation(&s[2], &s[29], false, min, max);
+ HadamardRotation(&s[3], &s[28], false, min, max);
+ HadamardRotation(&s[4], &s[27], false, min, max);
+ HadamardRotation(&s[5], &s[26], false, min, max);
+ HadamardRotation(&s[6], &s[25], false, min, max);
+ HadamardRotation(&s[7], &s[24], false, min, max);
+ HadamardRotation(&s[8], &s[23], false, min, max);
+ HadamardRotation(&s[9], &s[22], false, min, max);
+ HadamardRotation(&s[10], &s[21], false, min, max);
+ HadamardRotation(&s[11], &s[20], false, min, max);
+ HadamardRotation(&s[12], &s[19], false, min, max);
+ HadamardRotation(&s[13], &s[18], false, min, max);
+ HadamardRotation(&s[14], &s[17], false, min, max);
+ HadamardRotation(&s[15], &s[16], false, min, max);
+ }
+}
+
+// Process dct32 rows or columns, depending on the |is_row| flag.
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[32], x[32];
+
+ if (is_row) {
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
+ s[0] = x[0];
+ s[1] = x[16];
+ s[2] = x[8];
+ s[3] = x[24];
+ s[4] = x[4];
+ s[5] = x[20];
+ s[6] = x[12];
+ s[7] = x[28];
+ s[8] = x[2];
+ s[9] = x[18];
+ s[10] = x[10];
+ s[11] = x[26];
+ s[12] = x[6];
+ s[13] = x[22];
+ s[14] = x[14];
+ s[15] = x[30];
+
+ // 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
+ s[16] = x[1];
+ s[17] = x[17];
+ s[18] = x[9];
+ s[19] = x[25];
+ s[20] = x[5];
+ s[21] = x[21];
+ s[22] = x[13];
+ s[23] = x[29];
+ s[24] = x[3];
+ s[25] = x[19];
+ s[26] = x[11];
+ s[27] = x[27];
+ s[28] = x[7];
+ s[29] = x[23];
+ s[30] = x[15];
+ s[31] = x[31];
+
+ Dct4Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4>(s, &min, &max, /*is_last_stage=*/true);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 32; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<32>(dst, step, 0, &s[0]);
+ }
+}
+
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[64], x[32];
+
+ if (is_row) {
+ // The last 32 values of every row are always zero if the |tx_width| is
+ // 64.
+ for (int idx = 0; idx < 32; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ // The last 32 values of every column are always zero if the |tx_height| is
+ // 64.
+ LoadSrc<32>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1
+ // kBitReverseLookup
+ // 0, 32, 16, 48, 8, 40, 24, 56, 4, 36, 20, 52, 12, 44, 28, 60,
+ s[0] = x[0];
+ s[2] = x[16];
+ s[4] = x[8];
+ s[6] = x[24];
+ s[8] = x[4];
+ s[10] = x[20];
+ s[12] = x[12];
+ s[14] = x[28];
+
+ // 2, 34, 18, 50, 10, 42, 26, 58, 6, 38, 22, 54, 14, 46, 30, 62,
+ s[16] = x[2];
+ s[18] = x[18];
+ s[20] = x[10];
+ s[22] = x[26];
+ s[24] = x[6];
+ s[26] = x[22];
+ s[28] = x[14];
+ s[30] = x[30];
+
+ // 1, 33, 17, 49, 9, 41, 25, 57, 5, 37, 21, 53, 13, 45, 29, 61,
+ s[32] = x[1];
+ s[34] = x[17];
+ s[36] = x[9];
+ s[38] = x[25];
+ s[40] = x[5];
+ s[42] = x[21];
+ s[44] = x[13];
+ s[46] = x[29];
+
+ // 3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63
+ s[48] = x[3];
+ s[50] = x[19];
+ s[52] = x[11];
+ s[54] = x[27];
+ s[56] = x[7];
+ s[58] = x[23];
+ s[60] = x[15];
+ s[62] = x[31];
+
+ Dct4Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct8Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct16Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+ Dct32Stages<ButterflyRotation_4, /*is_fast_butterfly=*/true>(
+ s, &min, &max, /*is_last_stage=*/false);
+
+ //-- start dct 64 stages
+ // stage 2.
+ ButterflyRotation_SecondIsZero(&s[32], &s[63], 63 - 0, false);
+ ButterflyRotation_FirstIsZero(&s[33], &s[62], 63 - 32, false);
+ ButterflyRotation_SecondIsZero(&s[34], &s[61], 63 - 16, false);
+ ButterflyRotation_FirstIsZero(&s[35], &s[60], 63 - 48, false);
+ ButterflyRotation_SecondIsZero(&s[36], &s[59], 63 - 8, false);
+ ButterflyRotation_FirstIsZero(&s[37], &s[58], 63 - 40, false);
+ ButterflyRotation_SecondIsZero(&s[38], &s[57], 63 - 24, false);
+ ButterflyRotation_FirstIsZero(&s[39], &s[56], 63 - 56, false);
+ ButterflyRotation_SecondIsZero(&s[40], &s[55], 63 - 4, false);
+ ButterflyRotation_FirstIsZero(&s[41], &s[54], 63 - 36, false);
+ ButterflyRotation_SecondIsZero(&s[42], &s[53], 63 - 20, false);
+ ButterflyRotation_FirstIsZero(&s[43], &s[52], 63 - 52, false);
+ ButterflyRotation_SecondIsZero(&s[44], &s[51], 63 - 12, false);
+ ButterflyRotation_FirstIsZero(&s[45], &s[50], 63 - 44, false);
+ ButterflyRotation_SecondIsZero(&s[46], &s[49], 63 - 28, false);
+ ButterflyRotation_FirstIsZero(&s[47], &s[48], 63 - 60, false);
+
+ // stage 4.
+ HadamardRotation(&s[32], &s[33], false, &min, &max);
+ HadamardRotation(&s[34], &s[35], true, &min, &max);
+ HadamardRotation(&s[36], &s[37], false, &min, &max);
+ HadamardRotation(&s[38], &s[39], true, &min, &max);
+ HadamardRotation(&s[40], &s[41], false, &min, &max);
+ HadamardRotation(&s[42], &s[43], true, &min, &max);
+ HadamardRotation(&s[44], &s[45], false, &min, &max);
+ HadamardRotation(&s[46], &s[47], true, &min, &max);
+ HadamardRotation(&s[48], &s[49], false, &min, &max);
+ HadamardRotation(&s[50], &s[51], true, &min, &max);
+ HadamardRotation(&s[52], &s[53], false, &min, &max);
+ HadamardRotation(&s[54], &s[55], true, &min, &max);
+ HadamardRotation(&s[56], &s[57], false, &min, &max);
+ HadamardRotation(&s[58], &s[59], true, &min, &max);
+ HadamardRotation(&s[60], &s[61], false, &min, &max);
+ HadamardRotation(&s[62], &s[63], true, &min, &max);
+
+ // stage 7.
+ ButterflyRotation_4(&s[62], &s[33], 60 - 0, true);
+ ButterflyRotation_4(&s[61], &s[34], 60 - 0 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 60 - 32, true);
+ ButterflyRotation_4(&s[57], &s[38], 60 - 32 + 64, true);
+ ButterflyRotation_4(&s[54], &s[41], 60 - 16, true);
+ ButterflyRotation_4(&s[53], &s[42], 60 - 16 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 60 - 48, true);
+ ButterflyRotation_4(&s[49], &s[46], 60 - 48 + 64, true);
+
+ // stage 11.
+ HadamardRotation(&s[32], &s[35], false, &min, &max);
+ HadamardRotation(&s[33], &s[34], false, &min, &max);
+ HadamardRotation(&s[36], &s[39], true, &min, &max);
+ HadamardRotation(&s[37], &s[38], true, &min, &max);
+ HadamardRotation(&s[40], &s[43], false, &min, &max);
+ HadamardRotation(&s[41], &s[42], false, &min, &max);
+ HadamardRotation(&s[44], &s[47], true, &min, &max);
+ HadamardRotation(&s[45], &s[46], true, &min, &max);
+ HadamardRotation(&s[48], &s[51], false, &min, &max);
+ HadamardRotation(&s[49], &s[50], false, &min, &max);
+ HadamardRotation(&s[52], &s[55], true, &min, &max);
+ HadamardRotation(&s[53], &s[54], true, &min, &max);
+ HadamardRotation(&s[56], &s[59], false, &min, &max);
+ HadamardRotation(&s[57], &s[58], false, &min, &max);
+ HadamardRotation(&s[60], &s[63], true, &min, &max);
+ HadamardRotation(&s[61], &s[62], true, &min, &max);
+
+ // stage 16.
+ ButterflyRotation_4(&s[61], &s[34], 56, true);
+ ButterflyRotation_4(&s[60], &s[35], 56, true);
+ ButterflyRotation_4(&s[59], &s[36], 56 + 64, true);
+ ButterflyRotation_4(&s[58], &s[37], 56 + 64, true);
+ ButterflyRotation_4(&s[53], &s[42], 56 - 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 56 - 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 56 - 32 + 64, true);
+ ButterflyRotation_4(&s[50], &s[45], 56 - 32 + 64, true);
+
+ // stage 21.
+ HadamardRotation(&s[32], &s[39], false, &min, &max);
+ HadamardRotation(&s[33], &s[38], false, &min, &max);
+ HadamardRotation(&s[34], &s[37], false, &min, &max);
+ HadamardRotation(&s[35], &s[36], false, &min, &max);
+ HadamardRotation(&s[40], &s[47], true, &min, &max);
+ HadamardRotation(&s[41], &s[46], true, &min, &max);
+ HadamardRotation(&s[42], &s[45], true, &min, &max);
+ HadamardRotation(&s[43], &s[44], true, &min, &max);
+ HadamardRotation(&s[48], &s[55], false, &min, &max);
+ HadamardRotation(&s[49], &s[54], false, &min, &max);
+ HadamardRotation(&s[50], &s[53], false, &min, &max);
+ HadamardRotation(&s[51], &s[52], false, &min, &max);
+ HadamardRotation(&s[56], &s[63], true, &min, &max);
+ HadamardRotation(&s[57], &s[62], true, &min, &max);
+ HadamardRotation(&s[58], &s[61], true, &min, &max);
+ HadamardRotation(&s[59], &s[60], true, &min, &max);
+
+ // stage 25.
+ ButterflyRotation_4(&s[59], &s[36], 48, true);
+ ButterflyRotation_4(&s[58], &s[37], 48, true);
+ ButterflyRotation_4(&s[57], &s[38], 48, true);
+ ButterflyRotation_4(&s[56], &s[39], 48, true);
+ ButterflyRotation_4(&s[55], &s[40], 112, true);
+ ButterflyRotation_4(&s[54], &s[41], 112, true);
+ ButterflyRotation_4(&s[53], &s[42], 112, true);
+ ButterflyRotation_4(&s[52], &s[43], 112, true);
+
+ // stage 28.
+ HadamardRotation(&s[32], &s[47], false, &min, &max);
+ HadamardRotation(&s[33], &s[46], false, &min, &max);
+ HadamardRotation(&s[34], &s[45], false, &min, &max);
+ HadamardRotation(&s[35], &s[44], false, &min, &max);
+ HadamardRotation(&s[36], &s[43], false, &min, &max);
+ HadamardRotation(&s[37], &s[42], false, &min, &max);
+ HadamardRotation(&s[38], &s[41], false, &min, &max);
+ HadamardRotation(&s[39], &s[40], false, &min, &max);
+ HadamardRotation(&s[48], &s[63], true, &min, &max);
+ HadamardRotation(&s[49], &s[62], true, &min, &max);
+ HadamardRotation(&s[50], &s[61], true, &min, &max);
+ HadamardRotation(&s[51], &s[60], true, &min, &max);
+ HadamardRotation(&s[52], &s[59], true, &min, &max);
+ HadamardRotation(&s[53], &s[58], true, &min, &max);
+ HadamardRotation(&s[54], &s[57], true, &min, &max);
+ HadamardRotation(&s[55], &s[56], true, &min, &max);
+
+ // stage 30.
+ ButterflyRotation_4(&s[55], &s[40], 32, true);
+ ButterflyRotation_4(&s[54], &s[41], 32, true);
+ ButterflyRotation_4(&s[53], &s[42], 32, true);
+ ButterflyRotation_4(&s[52], &s[43], 32, true);
+ ButterflyRotation_4(&s[51], &s[44], 32, true);
+ ButterflyRotation_4(&s[50], &s[45], 32, true);
+ ButterflyRotation_4(&s[49], &s[46], 32, true);
+ ButterflyRotation_4(&s[48], &s[47], 32, true);
+
+ // stage 31.
+ for (int i = 0; i < 32; i += 4) {
+ HadamardRotation(&s[i], &s[63 - i], false, &min, &max);
+ HadamardRotation(&s[i + 1], &s[63 - i - 1], false, &min, &max);
+ HadamardRotation(&s[i + 2], &s[63 - i - 2], false, &min, &max);
+ HadamardRotation(&s[i + 3], &s[63 - i - 3], false, &min, &max);
+ }
+ //-- end dct 64 stages
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int idx = 0; idx < 64; idx += 8) {
+ int32x4_t output[8];
+ Transpose4x4(&s[idx], &output[0]);
+ Transpose4x4(&s[idx + 4], &output[4]);
+ for (int i = 0; i < 8; ++i) {
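+        // vqrshlq_s32 applies the rounded row shift; vqmovn_s32 then
+        // saturates each lane to the int16 range and vmovl_s16 widens it
+        // back to 32 bits for storage.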
+ output[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(output[i], v_row_shift)));
+ }
+ StoreDst<4>(dst, step, idx, &output[0]);
+ StoreDst<4>(dst, step, idx + 4, &output[4]);
+ }
+ } else {
+ StoreDst<64>(dst, step, 0, &s[0]);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Asymmetric Discrete Sine Transforms (ADST).
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+ int32x4_t x[4];
+
+ LoadSrc<4>(dst, step, 0, x);
+ if (is_row) {
+ Transpose4x4(x, x);
+ }
+
+ // stage 1.
+ s[5] = vmulq_n_s32(x[3], kAdst4Multiplier[1]);
+ s[6] = vmulq_n_s32(x[3], kAdst4Multiplier[3]);
+
+ // stage 2.
+ const int32x4_t a7 = vsubq_s32(x[0], x[2]);
+ const int32x4_t b7 = vaddq_s32(a7, x[3]);
+
+ // stage 3.
+ s[0] = vmulq_n_s32(x[0], kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(x[0], kAdst4Multiplier[1]);
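+  // The s[3]/s[4] terms of the scalar reference are folded directly into
+  // s[0]/s[1] using multiply-accumulate below.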
+ // s[0] = s[0] + s[3]
+ s[0] = vmlaq_n_s32(s[0], x[2], kAdst4Multiplier[3]);
+ // s[1] = s[1] - s[4]
+ s[1] = vmlsq_n_s32(s[1], x[2], kAdst4Multiplier[0]);
+
+ s[3] = vmulq_n_s32(x[1], kAdst4Multiplier[2]);
+ s[2] = vmulq_n_s32(b7, kAdst4Multiplier[2]);
+
+ // stage 4.
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[6]);
+
+ // stages 5 and 6.
+ const int32x4_t x0 = vaddq_s32(s[0], s[3]);
+ const int32x4_t x1 = vaddq_s32(s[1], s[3]);
+ const int32x4_t x3_a = vaddq_s32(s[0], s[1]);
+ const int32x4_t x3 = vsubq_s32(x3_a, s[3]);
+ x[0] = vrshrq_n_s32(x0, 12);
+ x[1] = vrshrq_n_s32(x1, 12);
+ x[2] = vrshrq_n_s32(s[2], 12);
+ x[3] = vrshrq_n_s32(x3, 12);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ x[0] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[0], v_row_shift)));
+ x[1] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[1], v_row_shift)));
+ x[2] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[2], v_row_shift)));
+ x[3] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[3], v_row_shift)));
+ Transpose4x4(x, x);
+ }
+ StoreDst<4>(dst, step, 0, x);
+}
+
+alignas(16) constexpr int32_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
+ 2482};
+
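+// Fast path for blocks where only the DC coefficient is nonzero: the full
+// Adst4 collapses to a few scalar multiplies of dst[0] by the
+// kAdst4DcOnlyMultiplier constants. The vbslq_s32 select applies the optional
+// row rounding without branching.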
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[2];
+
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src0_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src0_round, v_src0);
+ const int32x4_t kAdst4DcOnlyMultipliers = vld1q_s32(kAdst4DcOnlyMultiplier);
+ s[1] = vdupq_n_s32(0);
+
+ // s0*k0 s0*k1 s0*k2 s0*k1
+ s[0] = vmulq_s32(kAdst4DcOnlyMultipliers, v_src);
+ // 0 0 0 s0*k0
+ s[1] = vextq_s32(s[1], s[0], 1);
+
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x3, 12);
+
+ // vqrshlq_s32 will shift right if shift value is negative.
+ vst1q_s32(dst,
+ vmovl_s16(vqmovn_s32(vqrshlq_s32(dst_0, vdupq_n_s32(-row_shift)))));
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[4];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&dst[i]);
+
+ s[0] = vmulq_n_s32(v_src, kAdst4Multiplier[0]);
+ s[1] = vmulq_n_s32(v_src, kAdst4Multiplier[1]);
+ s[2] = vmulq_n_s32(v_src, kAdst4Multiplier[2]);
+
+ const int32x4_t x0 = s[0];
+ const int32x4_t x1 = s[1];
+ const int32x4_t x2 = s[2];
+ const int32x4_t x3 = vaddq_s32(s[0], s[1]);
+ const int32x4_t dst_0 = vrshrq_n_s32(x0, 12);
+ const int32x4_t dst_1 = vrshrq_n_s32(x1, 12);
+ const int32x4_t dst_2 = vrshrq_n_s32(x2, 12);
+ const int32x4_t dst_3 = vrshrq_n_s32(x3, 12);
+
+ vst1q_s32(&dst[i], dst_0);
+ vst1q_s32(&dst[i + width * 1], dst_1);
+ vst1q_s32(&dst[i + width * 2], dst_2);
+ vst1q_s32(&dst[i + width * 3], dst_3);
+
+ i += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
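+  // Hadamard stages clamp intermediates to the (kBitdepth10 + 7)-bit signed
+  // range for rows and to the int16 range for columns.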
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[8], x[8];
+
+ if (is_row) {
+ LoadSrc<4>(dst, step, 0, &x[0]);
+ LoadSrc<4>(dst, step, 4, &x[4]);
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ } else {
+ LoadSrc<8>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[7];
+ s[1] = x[0];
+ s[2] = x[5];
+ s[3] = x[2];
+ s[4] = x[3];
+ s[5] = x[4];
+ s[6] = x[1];
+ s[7] = x[6];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 8; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ Transpose4x4(&x[0], &x[0]);
+ Transpose4x4(&x[4], &x[4]);
+ StoreDst<4>(dst, step, 0, &x[0]);
+ StoreDst<4>(dst, step, 4, &x[4]);
+ } else {
+ StoreDst<8>(dst, step, 0, &x[0]);
+ }
+}
+
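+// DC-only fast path for Adst8: with a single nonzero input the butterfly
+// network degenerates to copies and rotations of one value, and only lane 0
+// of each result vector is written back.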
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int i = 0; i < 8; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[8];
+
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 60, true);
+
+ // stage 3.
+ s[4] = s[0];
+ s[5] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+
+ // stage 5.
+ s[2] = s[0];
+ s[3] = s[1];
+ s[6] = s[4];
+ s[7] = s[5];
+
+ // stage 6.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+
+ // stage 7.
+ int32x4_t x[8];
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[4]);
+ x[2] = s[6];
+ x[3] = vqnegq_s32(s[2]);
+ x[4] = s[3];
+ x[5] = vqnegq_s32(s[7]);
+ x[6] = s[5];
+ x[7] = vqnegq_s32(s[1]);
+
+ for (int j = 0; j < 8; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+template <ButterflyRotationFunc butterfly_rotation>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
+ int row_shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32_t range = is_row ? kBitdepth10 + 7 : 15;
+ const int32x4_t min = vdupq_n_s32(-(1 << range));
+ const int32x4_t max = vdupq_n_s32((1 << range) - 1);
+ int32x4_t s[16], x[16];
+
+ if (is_row) {
+ for (int idx = 0; idx < 16; idx += 8) {
+ LoadSrc<4>(dst, step, idx, &x[idx]);
+ LoadSrc<4>(dst, step, idx + 4, &x[idx + 4]);
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ }
+ } else {
+ LoadSrc<16>(dst, step, 0, &x[0]);
+ }
+
+ // stage 1.
+ s[0] = x[15];
+ s[1] = x[0];
+ s[2] = x[13];
+ s[3] = x[2];
+ s[4] = x[11];
+ s[5] = x[4];
+ s[6] = x[9];
+ s[7] = x[6];
+ s[8] = x[7];
+ s[9] = x[8];
+ s[10] = x[5];
+ s[11] = x[10];
+ s[12] = x[3];
+ s[13] = x[12];
+ s[14] = x[1];
+ s[15] = x[14];
+
+ // stage 2.
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
+
+ // stage 3.
+ HadamardRotation(&s[0], &s[8], false, &min, &max);
+ HadamardRotation(&s[1], &s[9], false, &min, &max);
+ HadamardRotation(&s[2], &s[10], false, &min, &max);
+ HadamardRotation(&s[3], &s[11], false, &min, &max);
+ HadamardRotation(&s[4], &s[12], false, &min, &max);
+ HadamardRotation(&s[5], &s[13], false, &min, &max);
+ HadamardRotation(&s[6], &s[14], false, &min, &max);
+ HadamardRotation(&s[7], &s[15], false, &min, &max);
+
+ // stage 4.
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
+
+ // stage 5.
+ HadamardRotation(&s[0], &s[4], false, &min, &max);
+ HadamardRotation(&s[8], &s[12], false, &min, &max);
+ HadamardRotation(&s[1], &s[5], false, &min, &max);
+ HadamardRotation(&s[9], &s[13], false, &min, &max);
+ HadamardRotation(&s[2], &s[6], false, &min, &max);
+ HadamardRotation(&s[10], &s[14], false, &min, &max);
+ HadamardRotation(&s[3], &s[7], false, &min, &max);
+ HadamardRotation(&s[11], &s[15], false, &min, &max);
+
+ // stage 6.
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
+
+ // stage 7.
+ HadamardRotation(&s[0], &s[2], false, &min, &max);
+ HadamardRotation(&s[4], &s[6], false, &min, &max);
+ HadamardRotation(&s[8], &s[10], false, &min, &max);
+ HadamardRotation(&s[12], &s[14], false, &min, &max);
+ HadamardRotation(&s[1], &s[3], false, &min, &max);
+ HadamardRotation(&s[5], &s[7], false, &min, &max);
+ HadamardRotation(&s[9], &s[11], false, &min, &max);
+ HadamardRotation(&s[13], &s[15], false, &min, &max);
+
+ // stage 8.
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+
+ if (is_row) {
+ const int32x4_t v_row_shift = vdupq_n_s32(-row_shift);
+ for (int i = 0; i < 16; ++i) {
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], v_row_shift)));
+ }
+ for (int idx = 0; idx < 16; idx += 8) {
+ Transpose4x4(&x[idx], &x[idx]);
+ Transpose4x4(&x[idx + 4], &x[idx + 4]);
+ StoreDst<4>(dst, step, idx, &x[idx]);
+ StoreDst<4>(dst, step, idx + 4, &x[idx + 4]);
+ }
+ } else {
+ StoreDst<16>(dst, step, 0, &x[0]);
+ }
+}
+
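+// Expands a single DC input through stages 2-9 of the Adst16. Shared by the
+// row (Adst16DcOnly) and column (Adst16DcOnlyColumn) fast paths.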
+LIBGAV1_ALWAYS_INLINE void Adst16DcOnlyInternal(int32x4_t* s, int32x4_t* x) {
+ // stage 2.
+ ButterflyRotation_FirstIsZero(&s[0], &s[1], 62, true);
+
+ // stage 3.
+ s[8] = s[0];
+ s[9] = s[1];
+
+ // stage 4.
+ ButterflyRotation_4(&s[8], &s[9], 56, true);
+
+ // stage 5.
+ s[4] = s[0];
+ s[12] = s[8];
+ s[5] = s[1];
+ s[13] = s[9];
+
+ // stage 6.
+ ButterflyRotation_4(&s[4], &s[5], 48, true);
+ ButterflyRotation_4(&s[12], &s[13], 48, true);
+
+ // stage 7.
+ s[2] = s[0];
+ s[6] = s[4];
+ s[10] = s[8];
+ s[14] = s[12];
+ s[3] = s[1];
+ s[7] = s[5];
+ s[11] = s[9];
+ s[15] = s[13];
+
+ // stage 8.
+ ButterflyRotation_4(&s[2], &s[3], 32, true);
+ ButterflyRotation_4(&s[6], &s[7], 32, true);
+ ButterflyRotation_4(&s[10], &s[11], 32, true);
+ ButterflyRotation_4(&s[14], &s[15], 32, true);
+
+ // stage 9.
+ x[0] = s[0];
+ x[1] = vqnegq_s32(s[8]);
+ x[2] = s[12];
+ x[3] = vqnegq_s32(s[4]);
+ x[4] = s[6];
+ x[5] = vqnegq_s32(s[14]);
+ x[6] = s[10];
+ x[7] = vqnegq_s32(s[2]);
+ x[8] = s[3];
+ x[9] = vqnegq_s32(s[11]);
+ x[10] = s[15];
+ x[11] = vqnegq_s32(s[7]);
+ x[12] = s[5];
+ x[13] = vqnegq_s32(s[13]);
+ x[14] = s[9];
+ x[15] = vqnegq_s32(s[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src, kTransformRowMultiplier << (31 - 12));
+ // stage 1.
+ s[1] = vbslq_s32(v_mask, v_src_round, v_src);
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int i = 0; i < 16; ++i) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ x[i] = vmovl_s16(vqmovn_s32(vqrshlq_s32(x[i], vdupq_n_s32(-row_shift))));
+ vst1q_lane_s32(&dst[i], x[i], 0);
+ }
+
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
+ int width) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ int i = 0;
+ do {
+ int32x4_t s[16];
+ int32x4_t x[16];
+ const int32x4_t v_src = vld1q_s32(dst);
+ // stage 1.
+ s[1] = v_src;
+
+ Adst16DcOnlyInternal(s, x);
+
+ for (int j = 0; j < 16; ++j) {
+ vst1q_s32(&dst[j * width], x[j]);
+ }
+ i += 4;
+ dst += 4;
+ } while (i < width);
+
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// Identity Transforms.
+
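+// Note on the "dual round" constants below: they fold the rounding bias of
+// the >> 12 multiplier stage (1 << 11) together with, when shift > 0, the
+// bias of the subsequent shift (1 << (11 + shift)), so the two rounding
+// shifts collapse into a single shift by (12 + shift).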
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step, int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(shift_lo)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int tx_height) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int shift = tx_height < 16 ? 0 : 1;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_multiplier = vdupq_n_s32(kIdentity4Multiplier);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+ const int32x4_t v_src_mult_lo = vmlaq_s32(v_dual_round, v_src, v_multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+template <int identity_size>
+LIBGAV1_ALWAYS_INLINE void IdentityColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ static_assert(identity_size == 4 || identity_size == 8 || identity_size == 16,
+ "Invalid identity_size.");
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (1 << 4)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[i * 4]);
+ v_src.val[1] = vld1q_s32(&source[(i * 4) + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst);
+ frame_data.val[1] = vld1_u16(dst + stride);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + stride, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ dst += stride << 1;
+ i += 2;
+ } while (i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_dst_i, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ if (identity_size == 4) {
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity4Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity4Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ } else if (identity_size == 8) {
+ v_dst_i.val[0] = vaddq_s32(v_src.val[0], v_src.val[0]);
+ v_dst_i.val[1] = vaddq_s32(v_src.val[1], v_src.val[1]);
+ a.val[0] = vrshrq_n_s32(v_dst_i.val[0], 4);
+ a.val[1] = vrshrq_n_s32(v_dst_i.val[1], 4);
+ } else { // identity_size == 16
+ v_dst_i.val[0] =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ v_dst_i.val[1] =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ a.val[0] = vshrq_n_s32(v_dst_i.val[0], 4 + 12);
+ a.val[1] = vshrq_n_s32(v_dst_i.val[1], 4 + 12);
+ }
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
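+// Fused row + column pass used for kTransformTypeIdentityIdentity. The 4x4
+// path applies the identity4 scaling twice back to back; the 8x4 path folds
+// the row rounding (kTransformRowMultiplier) and the identity8 doubling into
+// the identity4 column scale before adding the result to the frame.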
+LIBGAV1_ALWAYS_INLINE void Identity4RowColumnStoreToFrame(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int tx_height, const int32_t* source) {
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+ const int32x4_t v_round = vdupq_n_s32((1 + (0)) << 11);
+ const uint16x4_t v_max_bitdepth = vdup_n_u16((1 << kBitdepth10) - 1);
+
+ if (tx_width == 4) {
+ int i = 0;
+ do {
+ const int32x4_t v_src = vld1q_s32(&source[i * 4]);
+ const int32x4_t v_dst_row =
+ vshrq_n_s32(vmlaq_n_s32(v_round, v_src, kIdentity4Multiplier), 12);
+ const int32x4_t v_dst_col =
+ vmlaq_n_s32(v_round, v_dst_row, kIdentity4Multiplier);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(v_dst_col, 4 + 12);
+ const int32x4_t b = vaddw_s16(a, vreinterpret_s16_u16(frame_data));
+ vst1_u16(dst, vmin_u16(vqmovun_s32(b), v_max_bitdepth));
+ dst += stride;
+ } while (++i < tx_height);
+ } else {
+ int i = 0;
+ do {
+ const int row = i * tx_width;
+ int j = 0;
+ do {
+ int32x4x2_t v_src, v_src_round, v_dst_row, v_dst_col, a, b;
+ v_src.val[0] = vld1q_s32(&source[row + j]);
+ v_src.val[1] = vld1q_s32(&source[row + j + 4]);
+ v_src_round.val[0] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[0], kTransformRowMultiplier), 12);
+ v_src_round.val[1] = vshrq_n_s32(
+ vmlaq_n_s32(v_round, v_src.val[1], kTransformRowMultiplier), 12);
+ v_dst_row.val[0] = vqaddq_s32(v_src_round.val[0], v_src_round.val[0]);
+ v_dst_row.val[1] = vqaddq_s32(v_src_round.val[1], v_src_round.val[1]);
+ v_dst_col.val[0] =
+ vmlaq_n_s32(v_round, v_dst_row.val[0], kIdentity4Multiplier);
+ v_dst_col.val[1] =
+ vmlaq_n_s32(v_round, v_dst_row.val[1], kIdentity4Multiplier);
+ uint16x4x2_t frame_data;
+ frame_data.val[0] = vld1_u16(dst + j);
+ frame_data.val[1] = vld1_u16(dst + j + 4);
+ a.val[0] = vrshrq_n_s32(v_dst_col.val[0], 4 + 12);
+ a.val[1] = vrshrq_n_s32(v_dst_col.val[1], 4 + 12);
+ b.val[0] = vaddw_s16(a.val[0], vreinterpret_s16_u16(frame_data.val[0]));
+ b.val[1] = vaddw_s16(a.val[1], vreinterpret_s16_u16(frame_data.val[1]));
+ vst1_u16(dst + j, vmin_u16(vqmovun_s32(b.val[0]), v_max_bitdepth));
+ vst1_u16(dst + j + 4, vmin_u16(vqmovun_s32(b.val[1]), v_max_bitdepth));
+ j += 8;
+ } while (j < tx_width);
+ dst += stride;
+ } while (++i < tx_height);
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height equal to 32 can be simplified from
+  // (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t a_lo = vrshrq_n_s32(v_src_lo, 1);
+ const int32x4_t a_hi = vrshrq_n_s32(v_src_hi, 1);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(a_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(a_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
+ auto* const dst = static_cast<int32_t*>(dest);
+
+ for (int i = 0; i < 4; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&dst[i * step]);
+ const int32x4_t v_src_hi = vld1q_s32(&dst[(i * step) + 4]);
+ const int32x4_t v_srcx2_lo = vqaddq_s32(v_src_lo, v_src_lo);
+ const int32x4_t v_srcx2_hi = vqaddq_s32(v_src_hi, v_src_hi);
+ vst1q_s32(&dst[i * step], vmovl_s16(vqmovn_s32(v_srcx2_lo)));
+ vst1q_s32(&dst[(i * step) + 4], vmovl_s16(vqmovn_s32(v_srcx2_hi)));
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int row_shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_srcx2 = vaddq_s32(v_src, v_src);
+ const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
+ auto* const dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ int32x4x2_t v_src;
+ v_src.val[0] = vld1q_s32(&dst[i * step + j * 8]);
+ v_src.val[1] = vld1q_s32(&dst[i * step + j * 8 + 4]);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src.val[0], kIdentity16Multiplier);
+ const int32x4_t v_src_mult_hi =
+ vmlaq_n_s32(v_dual_round, v_src.val[1], kIdentity16Multiplier);
+ const int32x4_t shift_lo = vqshlq_s32(v_src_mult_lo, v_shift);
+ const int32x4_t shift_hi = vqshlq_s32(v_src_mult_hi, v_shift);
+ vst1q_s32(&dst[i * step + j * 8], vmovl_s16(vqmovn_s32(shift_lo)));
+ vst1q_s32(&dst[i * step + j * 8 + 4], vmovl_s16(vqmovn_s32(shift_hi)));
+ }
+ }
+}
+
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
+ bool should_round, int shift) {
+ if (adjusted_tx_height > 1) return false;
+
+ auto* dst = static_cast<int32_t*>(dest);
+ const int32x4_t v_src0 = vdupq_n_s32(dst[0]);
+ const uint32x4_t v_mask = vdupq_n_u32(should_round ? 0xffffffff : 0);
+ const int32x4_t v_src_round =
+ vqrdmulhq_n_s32(v_src0, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t v_src = vbslq_s32(v_mask, v_src_round, v_src0);
+ const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
+ const int32x4_t v_src_mult_lo =
+ vmlaq_n_s32(v_dual_round, v_src, kIdentity16Multiplier);
+ const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, vdupq_n_s32(-(12 + shift)));
+ vst1q_lane_s32(dst, vmovl_s16(vqmovn_s32(dst_0)), 0);
+ return true;
+}
+
+//------------------------------------------------------------------------------
+// row/column transform loops
+
+template <int tx_height>
+LIBGAV1_ALWAYS_INLINE void FlipColumns(int32_t* source, int tx_width) {
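+  // A group of four lanes is reversed in two steps: vrev64q_s32 swaps the
+  // lanes within each 64-bit half, then vextq_s32(v, v, 2) swaps the halves.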
+ if (tx_width >= 16) {
+ int i = 0;
+ do {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ const int32x4_t c = vld1q_s32(&source[i + 8]);
+ const int32x4_t d = vld1q_s32(&source[i + 12]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ const int32x4_t c_rev = vrev64q_s32(c);
+ const int32x4_t d_rev = vrev64q_s32(d);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(d_rev, d_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(c_rev, c_rev, 2));
+ vst1q_s32(&source[i + 8], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 12], vextq_s32(a_rev, a_rev, 2));
+ i += 16;
+ } while (i < tx_width * tx_height);
+ } else if (tx_width == 8) {
+ for (int i = 0; i < 8 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(b_rev, b_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(a_rev, a_rev, 2));
+ }
+ } else {
+ // Process two rows per iteration.
+ for (int i = 0; i < 4 * tx_height; i += 8) {
+ // 00 01 02 03
+ const int32x4_t a = vld1q_s32(&source[i]);
+ const int32x4_t b = vld1q_s32(&source[i + 4]);
+ // 01 00 03 02
+ const int32x4_t a_rev = vrev64q_s32(a);
+ const int32x4_t b_rev = vrev64q_s32(b);
+ // 03 02 01 00
+ vst1q_s32(&source[i], vextq_s32(a_rev, a_rev, 2));
+ vst1q_s32(&source[i + 4], vextq_s32(b_rev, b_rev, 2));
+ }
+ }
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void ApplyRounding(int32_t* source, int num_rows) {
+ // Process two rows per iteration.
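+  // vqrdmulhq_n_s32 is a rounded doubling multiply returning the high 32
+  // bits; promoting the 12-bit kTransformRowMultiplier to Q31 with
+  // << (31 - 12) makes this equivalent to (a * multiplier + 2048) >> 12.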
+ int i = 0;
+ do {
+ const int32x4_t a_lo = vld1q_s32(&source[i]);
+ const int32x4_t a_hi = vld1q_s32(&source[i + 4]);
+ const int32x4_t b_lo =
+ vqrdmulhq_n_s32(a_lo, kTransformRowMultiplier << (31 - 12));
+ const int32x4_t b_hi =
+ vqrdmulhq_n_s32(a_hi, kTransformRowMultiplier << (31 - 12));
+ vst1q_s32(&source[i], b_lo);
+ vst1q_s32(&source[i + 4], b_hi);
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
+template <int tx_width>
+LIBGAV1_ALWAYS_INLINE void RowShift(int32_t* source, int num_rows,
+ int row_shift) {
+ // vqrshlq_s32 will shift right if shift value is negative.
+ row_shift = -row_shift;
+
+ // Process two rows per iteration.
+ int i = 0;
+ do {
+ const int32x4_t residual0 = vld1q_s32(&source[i]);
+ const int32x4_t residual1 = vld1q_s32(&source[i + 4]);
+ vst1q_s32(&source[i], vqrshlq_s32(residual0, vdupq_n_s32(row_shift)));
+ vst1q_s32(&source[i + 4], vqrshlq_s32(residual1, vdupq_n_s32(row_shift)));
+ i += 8;
+ } while (i < tx_width * num_rows);
+}
+
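+// Adds the rounded residual ((r + 8) >> 4) to the reconstructed frame and
+// clamps the sum to the 10-bit maximum. enable_flip_rows handles the
+// transform types that read the residual rows bottom to top.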
+template <int tx_height, bool enable_flip_rows = false>
+LIBGAV1_ALWAYS_INLINE void StoreToFrameWithRound(
+ Array2DView<uint16_t> frame, const int start_x, const int start_y,
+ const int tx_width, const int32_t* source, TransformType tx_type) {
+ const bool flip_rows =
+ enable_flip_rows ? kTransformFlipRowsMask.Contains(tx_type) : false;
+ const int stride = frame.columns();
+ uint16_t* dst = frame[start_y] + start_x;
+
+ if (tx_width == 4) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int row = flip_rows ? (tx_height - i - 1) * 4 : i * 4;
+ const int32x4_t residual = vld1q_s32(&source[row]);
+ const uint16x4_t frame_data = vld1_u16(dst);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const uint32x4_t b = vaddw_u16(vreinterpretq_u32_s32(a), frame_data);
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ vst1_u16(dst, vmin_u16(d, vdup_n_u16((1 << kBitdepth10) - 1)));
+ dst += stride;
+ }
+ } else {
+ for (int i = 0; i < tx_height; ++i) {
+ const int y = start_y + i;
+ const int row = flip_rows ? (tx_height - i - 1) * tx_width : i * tx_width;
+ int j = 0;
+ do {
+ const int x = start_x + j;
+ const int32x4_t residual = vld1q_s32(&source[row + j]);
+ const int32x4_t residual_hi = vld1q_s32(&source[row + j + 4]);
+ const uint16x8_t frame_data = vld1q_u16(frame[y] + x);
+ const int32x4_t a = vrshrq_n_s32(residual, 4);
+ const int32x4_t a_hi = vrshrq_n_s32(residual_hi, 4);
+ const uint32x4_t b =
+ vaddw_u16(vreinterpretq_u32_s32(a), vget_low_u16(frame_data));
+ const uint32x4_t b_hi =
+ vaddw_u16(vreinterpretq_u32_s32(a_hi), vget_high_u16(frame_data));
+ const uint16x4_t d = vqmovun_s32(vreinterpretq_s32_u32(b));
+ const uint16x4_t d_hi = vqmovun_s32(vreinterpretq_s32_u32(b_hi));
+ vst1q_u16(frame[y] + x, vminq_u16(vcombine_u16(d, d_hi),
+ vdupq_n_u16((1 << kBitdepth10) - 1)));
+ j += 8;
+ } while (j < tx_width);
+ }
+ }
+}
+
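+// Each row transform loop follows the same pattern: try the DC-only fast
+// path, apply the optional rounding pass, then run the 1D transform over
+// four rows per iteration. The column loops mirror this and finish by adding
+// the residual to the frame.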
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
+
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_4>(data, /*step=*/4, /*is_row=*/true,
+ row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+      Dct4_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d dct8 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, /*step=*/8, /*is_row=*/true,
+ row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_4>(data, 16, /*is_row=*/true, row_shift);
+ data += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct16 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct32 rows in parallel per iteration.
+ Dct32_NEON(data, 32, /*is_row=*/true, row_shift);
+ data += 128;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<32>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ // Process 4 1d dct64 rows in parallel per iteration.
+ Dct64_NEON(data, 64, /*is_row=*/true, row_shift);
+ data += 128 * 2;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<64>(src, tx_width);
+ }
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
+}
+
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
+
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, /*step=*/4, /*is_row=*/true, row_shift);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst4 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Adst4_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+    Adst8_NEON<ButterflyRotation_4>(data, /*step=*/8,
+                                    /*is_row=*/true, row_shift);
+ data += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ // Process 4 1d adst8 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+      Adst8_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ assert(adjusted_tx_height % 4 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 4 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(src, 16, /*is_row=*/true, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
+ int i = tx_width;
+ auto* data = src;
+ do {
+ // Process 4 1d adst16 columns in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_4>(data, tx_width, /*is_row=*/false,
+ /*row_shift=*/0);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
+ tx_width, src, tx_type);
+}
+
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ const int shift = tx_height > 8 ? 1 : 0;
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON(src, /*step=*/4, shift);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ // Special case: Process row calculations during column transform call.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ (tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+ return;
+ }
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<4>(src, tx_width);
+ }
+
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
+ return;
+ }
+
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+  // When combining the identity8 multiplier with the row shift, the
+  // calculations for tx_height == 8 and tx_height == 16 (selected below via
+  // tx_height & 0x18) can be simplified from (((A * 2) + 1) >> 1) to A. For
+  // 10bpp, A must be clamped to a signed 16-bit value.
+ if ((tx_height & 0x18) != 0) {
+ for (int i = 0; i < tx_height; ++i) {
+ const int32x4_t v_src_lo = vld1q_s32(&src[i * 8]);
+ const int32x4_t v_src_hi = vld1q_s32(&src[(i * 8) + 4]);
+ vst1q_s32(&src[i * 8], vmovl_s16(vqmovn_s32(v_src_lo)));
+ vst1q_s32(&src[(i * 8) + 4], vmovl_s16(vqmovn_s32(v_src_hi)));
+ }
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<8>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, row_shift);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int32_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (kTransformFlipColumnsMask.Contains(tx_type)) {
+ FlipColumns<16>(src, tx_width);
+ }
+ auto& frame = *static_cast<Array2DView<uint16_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+//------------------------------------------------------------------------------
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ // Maximum transform size for Dct is 64.
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
+
+ // Maximum transform size for Adst is 16.
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
+
+ // Maximum transform size for Identity transform is 32.
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+}
+
+} // namespace
+
+void InverseTransformInit10bpp_NEON() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_ENABLE_NEON || LIBGAV1_MAX_BITDEPTH < 10
+namespace libgav1 {
+namespace dsp {
+
+void InverseTransformInit10bpp_NEON() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_ENABLE_NEON && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.cc b/libgav1/src/dsp/arm/inverse_transform_neon.cc
index 5ad53f6..315d5e9 100644
--- a/libgav1/src/dsp/arm/inverse_transform_neon.cc
+++ b/libgav1/src/dsp/arm/inverse_transform_neon.cc
@@ -85,6 +85,8 @@
out[3] = vcombine_s16(d3, d3);
}
+// Note: this is only used in the final stage of Dct32/64 and Adst16 as the
+// in-place version causes additional stack usage with clang.
LIBGAV1_ALWAYS_INLINE void Transpose8x8(const int16x8_t in[8],
int16x8_t out[8]) {
// Swap 16 bit elements. Goes from:
@@ -392,8 +394,8 @@
// inside 12 bits. This leaves room for the sign bit and the 3 left shifted
// bits.
assert(sin128 <= 0xfff);
- const int16x8_t x = vqrdmulhq_s16(*b, vdupq_n_s16(-sin128 << 3));
- const int16x8_t y = vqrdmulhq_s16(*b, vdupq_n_s16(cos128 << 3));
+ const int16x8_t x = vqrdmulhq_n_s16(*b, -sin128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*b, cos128 << 3);
if (flip) {
*a = y;
*b = x;
@@ -409,8 +411,8 @@
const bool flip) {
const int16_t cos128 = Cos128(angle);
const int16_t sin128 = Sin128(angle);
- const int16x8_t x = vqrdmulhq_s16(*a, vdupq_n_s16(cos128 << 3));
- const int16x8_t y = vqrdmulhq_s16(*a, vdupq_n_s16(sin128 << 3));
+ const int16x8_t x = vqrdmulhq_n_s16(*a, cos128 << 3);
+ const int16x8_t y = vqrdmulhq_n_s16(*a, sin128 << 3);
if (flip) {
*a = y;
*b = x;
@@ -441,23 +443,18 @@
// Discrete Cosine Transforms (DCT).
template <int width>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const int16x8_t v_src = vdupq_n_s16(src[0]);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
const int16x8_t v_src_round =
vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
const int16x8_t s0 = vbslq_s16(v_mask, v_src_round, v_src);
const int16_t cos128 = Cos128(32);
- const int16x8_t xy = vqrdmulhq_s16(s0, vdupq_n_s16(cos128 << 3));
+ const int16x8_t xy = vqrdmulhq_n_s16(s0, cos128 << 3);
// vqrshlq_s16 will shift right if shift value is negative.
const int16x8_t xy_shifted = vqrshlq_s16(xy, vdupq_n_s16(-row_shift));
@@ -473,27 +470,23 @@
}
template <int height>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
const int16_t cos128 = Cos128(32);
// Calculate dc values for first row.
if (width == 4) {
- const int16x4_t v_src = vld1_s16(src);
- const int16x4_t xy = vqrdmulh_s16(v_src, vdup_n_s16(cos128 << 3));
+ const int16x4_t v_src = vld1_s16(dst);
+ const int16x4_t xy = vqrdmulh_n_s16(v_src, cos128 << 3);
vst1_s16(dst, xy);
} else {
int i = 0;
do {
- const int16x8_t v_src = vld1q_s16(&src[i]);
- const int16x8_t xy = vqrdmulhq_s16(v_src, vdupq_n_s16(cos128 << 3));
+ const int16x8_t v_src = vld1q_s16(&dst[i]);
+ const int16x8_t xy = vqrdmulhq_n_s16(v_src, cos128 << 3);
vst1q_s16(&dst[i], xy);
i += 8;
} while (i < width);
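DctDcOnlyColumn applies the same idea along the other axis: the first row is scaled in place by Cos128(32), and the hunk that follows broadcasts row 0 to the remaining rows with memcpy (previously it chain-copied row y-1 from src). A scalar sketch under the same Q12 assumption as above:

#include <cstdint>
#include <cstring>

// Scalar sketch of the DC-only column path: scale row 0, then replicate it.
bool DctDcOnlyColumnScalar(int16_t* dst, int adjusted_tx_height, int width,
                           int height) {
  if (adjusted_tx_height > 1) return false;
  for (int i = 0; i < width; ++i) {
    dst[i] = static_cast<int16_t>((dst[i] * 2896 + (1 << 11)) >> 12);
  }
  for (int y = 1; y < height; ++y) {
    memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
  }
  return true;
}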
@@ -501,21 +494,21 @@
// Copy first row to the rest of the block.
for (int y = 1; y < height; ++y) {
- memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
}
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(int16x8_t* s) {
// stage 12.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
} else {
- bufferfly_rotation(&s[0], &s[1], 32, true);
- bufferfly_rotation(&s[2], &s[3], 48, false);
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
}
// stage 17.
@@ -523,23 +516,21 @@
HadamardRotation(&s[1], &s[2], false);
}
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_NEON(void* dest, int32_t step, bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[4], x[4];
if (stage_is_rectangular) {
if (transpose) {
int16x8_t input[8];
- LoadSrc<8, 8>(src, step, 0, input);
+ LoadSrc<8, 8>(dst, step, 0, input);
Transpose4x8To8x4(input, x);
} else {
- LoadSrc<16, 4>(src, step, 0, x);
+ LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(src, step, 0, x);
+ LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
Transpose4x4(x, x);
}
@@ -552,7 +543,7 @@
s[2] = x[1];
s[3] = x[3];
- Dct4Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
if (stage_is_rectangular) {
if (transpose) {
@@ -570,16 +561,16 @@
}
}
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(int16x8_t* s) {
// stage 8.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
} else {
- bufferfly_rotation(&s[4], &s[7], 56, false);
- bufferfly_rotation(&s[5], &s[6], 24, false);
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
}
// stage 13.
@@ -587,7 +578,7 @@
HadamardRotation(&s[6], &s[7], true);
// stage 18.
- bufferfly_rotation(&s[6], &s[5], 32, true);
+ butterfly_rotation(&s[6], &s[5], 32, true);
// stage 22.
HadamardRotation(&s[0], &s[7], false);
@@ -597,27 +588,24 @@
}
// Process dct8 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_NEON(void* dest, int32_t step, bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[8], x[8];
if (stage_is_rectangular) {
if (transpose) {
int16x8_t input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8(input, x);
} else {
- LoadSrc<8, 8>(src, step, 0, x);
+ LoadSrc<8, 8>(dst, step, 0, x);
}
} else if (transpose) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, 0, input);
- Transpose8x8(input, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
} else {
- LoadSrc<16, 8>(src, step, 0, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
}
// stage 1.
@@ -631,8 +619,8 @@
s[6] = x[3];
s[7] = x[7];
- Dct4Stages<bufferfly_rotation>(s);
- Dct8Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
if (stage_is_rectangular) {
if (transpose) {
@@ -643,28 +631,27 @@
StoreDst<8, 8>(dst, step, 0, s);
}
} else if (transpose) {
- int16x8_t output[8];
- Transpose8x8(s, output);
- StoreDst<16, 8>(dst, step, 0, output);
+ dsp::Transpose8x8(s);
+ StoreDst<16, 8>(dst, step, 0, s);
} else {
StoreDst<16, 8>(dst, step, 0, s);
}
}
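Here and below, the row paths now load straight into x and transpose with the in-place dsp::Transpose8x8, dropping the eight-register input temporary; per the note added near the top of the file, the out-of-place overload survives only for the final stage of Dct32/64 and Adst16, where the in-place form costs extra stack with clang. For reference, the in-place technique at 4x4 size looks like this (a sketch; the real 8x8 helper lives in libgav1's shared NEON code):

#include <arm_neon.h>

// In-place 4x4 transpose of int16 rows: vtrn on 16-bit pairs, then on
// 32-bit pairs, writing the results back into the same array.
inline void Transpose4x4InPlace(int16x4_t a[4]) {
  const int16x4x2_t b0 = vtrn_s16(a[0], a[1]);
  const int16x4x2_t b1 = vtrn_s16(a[2], a[3]);
  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));
  a[0] = vreinterpret_s16_s32(c0.val[0]);
  a[1] = vreinterpret_s16_s32(c1.val[0]);
  a[2] = vreinterpret_s16_s32(c0.val[1]);
  a[3] = vreinterpret_s16_s32(c1.val[1]);
}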
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(int16x8_t* s) {
// stage 5.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
} else {
- bufferfly_rotation(&s[8], &s[15], 60, false);
- bufferfly_rotation(&s[9], &s[14], 28, false);
- bufferfly_rotation(&s[10], &s[13], 44, false);
- bufferfly_rotation(&s[11], &s[12], 12, false);
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
}
// stage 9.
@@ -674,8 +661,8 @@
HadamardRotation(&s[14], &s[15], true);
// stage 14.
- bufferfly_rotation(&s[14], &s[9], 48, true);
- bufferfly_rotation(&s[13], &s[10], 112, true);
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
// stage 19.
HadamardRotation(&s[8], &s[11], false);
@@ -684,8 +671,8 @@
HadamardRotation(&s[13], &s[14], true);
// stage 23.
- bufferfly_rotation(&s[13], &s[10], 32, true);
- bufferfly_rotation(&s[12], &s[11], 32, true);
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
// stage 26.
HadamardRotation(&s[0], &s[15], false);
@@ -699,32 +686,29 @@
}
// Process dct16 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, const void* source,
- int32_t step, bool is_row,
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_NEON(void* dest, int32_t step, bool is_row,
int row_shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[16], x[16];
if (stage_is_rectangular) {
if (is_row) {
int16x8_t input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8(input, x);
- LoadSrc<16, 4>(src, step, 8, input);
+ LoadSrc<16, 4>(dst, step, 8, input);
Transpose8x4To4x8(input, &x[8]);
} else {
- LoadSrc<8, 16>(src, step, 0, x);
+ LoadSrc<8, 16>(dst, step, 0, x);
}
} else if (is_row) {
for (int idx = 0; idx < 16; idx += 8) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, idx, input);
- Transpose8x8(input, &x[idx]);
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
}
} else {
- LoadSrc<16, 16>(src, step, 0, x);
+ LoadSrc<16, 16>(dst, step, 0, x);
}
// stage 1
@@ -746,9 +730,9 @@
s[14] = x[7];
s[15] = x[15];
- Dct4Stages<bufferfly_rotation>(s);
- Dct8Stages<bufferfly_rotation>(s);
- Dct16Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
if (is_row) {
const int16x8_t v_row_shift = vdupq_n_s16(-row_shift);
@@ -769,16 +753,15 @@
}
} else if (is_row) {
for (int idx = 0; idx < 16; idx += 8) {
- int16x8_t output[8];
- Transpose8x8(&s[idx], output);
- StoreDst<16, 8>(dst, step, idx, output);
+ dsp::Transpose8x8(&s[idx]);
+ StoreDst<16, 8>(dst, step, idx, &s[idx]);
}
} else {
StoreDst<16, 16>(dst, step, 0, s);
}
}
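A pattern repeated throughout these hunks: every 1-D transform used to take a (dest, source) pair that callers always aliased to the same buffer; the rewrite drops source and makes the aliasing explicit, with LoadSrc and StoreDst both walking dst. Schematically, from the Dct16 hunk above (template parameters and the LIBGAV1_ALWAYS_INLINE qualifier omitted):

// Before: the same buffer was passed twice.
void Dct16_NEON(void* dest, const void* source, int32_t step, bool is_row,
                int row_shift);

// After: one buffer, read and written in place.
void Dct16_NEON(void* dest, int32_t step, bool is_row, int row_shift);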
-template <ButterflyRotationFunc bufferfly_rotation,
+template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(int16x8_t* s) {
// stage 3
@@ -792,14 +775,14 @@
ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
} else {
- bufferfly_rotation(&s[16], &s[31], 62, false);
- bufferfly_rotation(&s[17], &s[30], 30, false);
- bufferfly_rotation(&s[18], &s[29], 46, false);
- bufferfly_rotation(&s[19], &s[28], 14, false);
- bufferfly_rotation(&s[20], &s[27], 54, false);
- bufferfly_rotation(&s[21], &s[26], 22, false);
- bufferfly_rotation(&s[22], &s[25], 38, false);
- bufferfly_rotation(&s[23], &s[24], 6, false);
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
}
// stage 6.
HadamardRotation(&s[16], &s[17], false);
@@ -812,10 +795,10 @@
HadamardRotation(&s[30], &s[31], true);
// stage 10.
- bufferfly_rotation(&s[30], &s[17], 24 + 32, true);
- bufferfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
- bufferfly_rotation(&s[26], &s[21], 24, true);
- bufferfly_rotation(&s[25], &s[22], 24 + 64, true);
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
// stage 15.
HadamardRotation(&s[16], &s[19], false);
@@ -828,10 +811,10 @@
HadamardRotation(&s[29], &s[30], true);
// stage 20.
- bufferfly_rotation(&s[29], &s[18], 48, true);
- bufferfly_rotation(&s[28], &s[19], 48, true);
- bufferfly_rotation(&s[27], &s[20], 48 + 64, true);
- bufferfly_rotation(&s[26], &s[21], 48 + 64, true);
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
// stage 24.
HadamardRotation(&s[16], &s[23], false);
@@ -844,10 +827,10 @@
HadamardRotation(&s[27], &s[28], true);
// stage 27.
- bufferfly_rotation(&s[27], &s[20], 32, true);
- bufferfly_rotation(&s[26], &s[21], 32, true);
- bufferfly_rotation(&s[25], &s[22], 32, true);
- bufferfly_rotation(&s[24], &s[23], 32, true);
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
// stage 29.
HadamardRotation(&s[0], &s[31], false);
@@ -869,21 +852,18 @@
}
// Process dct32 rows or columns, depending on the transpose flag.
-LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const void* source,
- const int32_t step, const bool is_row,
- int row_shift) {
+LIBGAV1_ALWAYS_INLINE void Dct32_NEON(void* dest, const int32_t step,
+ const bool is_row, int row_shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[32], x[32];
if (is_row) {
for (int idx = 0; idx < 32; idx += 8) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, idx, input);
- Transpose8x8(input, &x[idx]);
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
}
} else {
- LoadSrc<16, 32>(src, step, 0, x);
+ LoadSrc<16, 32>(dst, step, 0, x);
}
// stage 1
@@ -946,24 +926,21 @@
// Allow the compiler to call this function instead of force inlining. Tests
// show this is slightly faster.
-void Dct64_NEON(void* dest, const void* source, int32_t step, bool is_row,
- int row_shift) {
+void Dct64_NEON(void* dest, int32_t step, bool is_row, int row_shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[64], x[32];
if (is_row) {
// The last 32 values of every row are always zero if the |tx_width| is
// 64.
for (int idx = 0; idx < 32; idx += 8) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, idx, input);
- Transpose8x8(input, &x[idx]);
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
}
} else {
// The last 32 values of every column are always zero if the |tx_height| is
// 64.
- LoadSrc<16, 32>(src, step, 0, x);
+ LoadSrc<16, 32>(dst, step, 0, x);
}
// stage 1
@@ -1171,23 +1148,22 @@
//------------------------------------------------------------------------------
// Asymmetric Discrete Sine Transforms (ADST).
template <bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, const void* source,
- int32_t step, bool transpose) {
+LIBGAV1_ALWAYS_INLINE void Adst4_NEON(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int32x4_t s[8];
int16x8_t x[4];
if (stage_is_rectangular) {
if (transpose) {
int16x8_t input[8];
- LoadSrc<8, 8>(src, step, 0, input);
+ LoadSrc<8, 8>(dst, step, 0, input);
Transpose4x8To8x4(input, x);
} else {
- LoadSrc<16, 4>(src, step, 0, x);
+ LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(src, step, 0, x);
+ LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
Transpose4x4(x, x);
}
@@ -1250,18 +1226,14 @@
alignas(8) constexpr int16_t kAdst4DcOnlyMultiplier[4] = {1321, 2482, 3344,
2482};
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int32x4_t s[2];
- const int16x4_t v_src0 = vdup_n_s16(src[0]);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
const int16x4_t v_src_round =
vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -1283,21 +1255,16 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
int32x4_t s[4];
int i = 0;
do {
- const int16x4_t v_src = vld1_s16(&src[i]);
+ const int16x4_t v_src = vld1_s16(&dst[i]);
s[0] = vmull_n_s16(v_src, kAdst4Multiplier[0]);
s[1] = vmull_n_s16(v_src, kAdst4Multiplier[1]);
@@ -1323,28 +1290,26 @@
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_NEON(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[8], x[8];
if (stage_is_rectangular) {
if (transpose) {
int16x8_t input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8(input, x);
} else {
- LoadSrc<8, 8>(src, step, 0, x);
+ LoadSrc<8, 8>(dst, step, 0, x);
}
} else {
if (transpose) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, 0, input);
- Transpose8x8(input, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
+ dsp::Transpose8x8(x);
} else {
- LoadSrc<16, 8>(src, step, 0, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
}
}
@@ -1359,10 +1324,10 @@
s[7] = x[6];
// stage 2.
- bufferfly_rotation(&s[0], &s[1], 60 - 0, true);
- bufferfly_rotation(&s[2], &s[3], 60 - 16, true);
- bufferfly_rotation(&s[4], &s[5], 60 - 32, true);
- bufferfly_rotation(&s[6], &s[7], 60 - 48, true);
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
// stage 3.
HadamardRotation(&s[0], &s[4], false);
@@ -1371,8 +1336,8 @@
HadamardRotation(&s[3], &s[7], false);
// stage 4.
- bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
- bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
// stage 5.
HadamardRotation(&s[0], &s[2], false);
@@ -1381,8 +1346,8 @@
HadamardRotation(&s[5], &s[7], false);
// stage 6.
- bufferfly_rotation(&s[2], &s[3], 32, true);
- bufferfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
// stage 7.
x[0] = s[0];
@@ -1404,27 +1369,22 @@
}
} else {
if (transpose) {
- int16x8_t output[8];
- Transpose8x8(x, output);
- StoreDst<16, 8>(dst, step, 0, output);
+ dsp::Transpose8x8(x);
+ StoreDst<16, 8>(dst, step, 0, x);
} else {
StoreDst<16, 8>(dst, step, 0, x);
}
}
}
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[8];
- const int16x8_t v_src = vdupq_n_s16(src[0]);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
const int16x8_t v_src_round =
vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
@@ -1471,20 +1431,16 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[8];
int i = 0;
do {
- const int16x8_t v_src = vld1q_s16(&src[i]);
+ const int16x8_t v_src = vld1q_s16(dst);
// stage 1.
s[1] = v_src;
@@ -1529,33 +1485,30 @@
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, const void* source,
- int32_t step, bool is_row,
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_NEON(void* dest, int32_t step, bool is_row,
int row_shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[16], x[16];
if (stage_is_rectangular) {
if (is_row) {
int16x8_t input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8(input, x);
- LoadSrc<16, 4>(src, step, 8, input);
+ LoadSrc<16, 4>(dst, step, 8, input);
Transpose8x4To4x8(input, &x[8]);
} else {
- LoadSrc<8, 16>(src, step, 0, x);
+ LoadSrc<8, 16>(dst, step, 0, x);
}
} else {
if (is_row) {
for (int idx = 0; idx < 16; idx += 8) {
- int16x8_t input[8];
- LoadSrc<16, 8>(src, step, idx, input);
- Transpose8x8(input, &x[idx]);
+ LoadSrc<16, 8>(dst, step, idx, &x[idx]);
+ dsp::Transpose8x8(&x[idx]);
}
} else {
- LoadSrc<16, 16>(src, step, 0, x);
+ LoadSrc<16, 16>(dst, step, 0, x);
}
}
@@ -1578,14 +1531,14 @@
s[15] = x[14];
// stage 2.
- bufferfly_rotation(&s[0], &s[1], 62 - 0, true);
- bufferfly_rotation(&s[2], &s[3], 62 - 8, true);
- bufferfly_rotation(&s[4], &s[5], 62 - 16, true);
- bufferfly_rotation(&s[6], &s[7], 62 - 24, true);
- bufferfly_rotation(&s[8], &s[9], 62 - 32, true);
- bufferfly_rotation(&s[10], &s[11], 62 - 40, true);
- bufferfly_rotation(&s[12], &s[13], 62 - 48, true);
- bufferfly_rotation(&s[14], &s[15], 62 - 56, true);
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
// stage 3.
HadamardRotation(&s[0], &s[8], false);
@@ -1598,10 +1551,10 @@
HadamardRotation(&s[7], &s[15], false);
// stage 4.
- bufferfly_rotation(&s[8], &s[9], 56 - 0, true);
- bufferfly_rotation(&s[13], &s[12], 8 + 0, true);
- bufferfly_rotation(&s[10], &s[11], 56 - 32, true);
- bufferfly_rotation(&s[15], &s[14], 8 + 32, true);
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
// stage 5.
HadamardRotation(&s[0], &s[4], false);
@@ -1614,10 +1567,10 @@
HadamardRotation(&s[11], &s[15], false);
// stage 6.
- bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
- bufferfly_rotation(&s[12], &s[13], 48 - 0, true);
- bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
- bufferfly_rotation(&s[15], &s[14], 48 - 32, true);
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
// stage 7.
HadamardRotation(&s[0], &s[2], false);
@@ -1630,10 +1583,10 @@
HadamardRotation(&s[13], &s[15], false);
// stage 8.
- bufferfly_rotation(&s[2], &s[3], 32, true);
- bufferfly_rotation(&s[6], &s[7], 32, true);
- bufferfly_rotation(&s[10], &s[11], 32, true);
- bufferfly_rotation(&s[14], &s[15], 32, true);
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
// stage 9.
x[0] = s[0];
@@ -1743,19 +1696,15 @@
x[15] = vqnegq_s16(s[1]);
}
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
int16x8_t s[16];
int16x8_t x[16];
- const int16x8_t v_src = vdupq_n_s16(src[0]);
+ const int16x8_t v_src = vdupq_n_s16(dst[0]);
const uint16x8_t v_mask = vdupq_n_u16(should_round ? 0xffff : 0);
const int16x8_t v_src_round =
vqrdmulhq_n_s16(v_src, kTransformRowMultiplier << 3);
@@ -1773,21 +1722,17 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
int i = 0;
do {
int16x8_t s[16];
int16x8_t x[16];
- const int16x8_t v_src = vld1q_s16(&src[i]);
+ const int16x8_t v_src = vld1q_s16(dst);
// stage 1.
s[1] = v_src;
@@ -1807,10 +1752,8 @@
// Identity Transforms.
template <bool is_row_shift>
-LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity4_NEON(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
if (is_row_shift) {
const int shift = 1;
@@ -1818,7 +1761,7 @@
const int16x4_t v_multiplier = vdup_n_s16(kIdentity4Multiplier);
const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
for (int i = 0; i < 4; i += 2) {
- const int16x8_t v_src = vld1q_s16(&src[i * step]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
const int32x4_t v_src_mult_lo =
vmlal_s16(v_dual_round, vget_low_s16(v_src), v_multiplier);
const int32x4_t v_src_mult_hi =
@@ -1830,7 +1773,7 @@
}
} else {
for (int i = 0; i < 4; i += 2) {
- const int16x8_t v_src = vld1q_s16(&src[i * step]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
const int16x8_t a =
vqrdmulhq_n_s16(v_src, kIdentity4MultiplierFraction << 3);
const int16x8_t b = vqaddq_s16(v_src, a);
@@ -1839,17 +1782,12 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int tx_height) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const int16x4_t v_src0 = vdup_n_s16(src[0]);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
const int16x4_t v_src_round =
vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -1860,7 +1798,7 @@
const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
const int32x4_t v_src_mult_lo = vmlal_s16(v_dual_round, v_src, v_multiplier);
const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
- vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
return true;
}
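The v_dual_round constant folds two rounding biases into the single (12 + shift) shift: the Q12 multiply alone would round with 1 << 11, and a subsequent rounding row shift would add 1 << (shift - 1) before shifting, which pre-scaled by 2^12 contributes 1 << (11 + shift). For the shifts these paths use (0 through 2), that sum equals (1 + (shift << 1)) << 11, the expression seen with v_dual_round here and in Identity16Row_NEON below. A scalar check, ignoring the saturation the NEON ops add (5793 for kIdentity4Multiplier is an assumption):

#include <cassert>
#include <cstdint>

// Verify the folded bias matches round-then-round for shift in {0, 1, 2}.
void CheckDualRound(int16_t x, int shift) {
  const int32_t kMultiplier = 5793;
  int32_t two_step = (x * kMultiplier + (1 << 11)) >> 12;
  if (shift > 0) two_step = (two_step + (1 << (shift - 1))) >> shift;
  const int32_t dual_round = (1 + (shift << 1)) << 11;
  const int32_t one_step = (x * kMultiplier + dual_round) >> (12 + shift);
  assert(two_step == one_step);
}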
@@ -2001,28 +1939,24 @@
}
}
-LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_NEON(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
// When combining the identity8 multiplier with the row shift, the
// calculations for tx_height equal to 32 can be simplified from
// (((A * 2) + 2) >> 2) to ((A + 1) >> 1).
for (int i = 0; i < 4; ++i) {
- const int16x8_t v_src = vld1q_s16(&src[i * step]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
const int16x8_t a = vrshrq_n_s16(v_src, 1);
vst1q_s16(&dst[i * step], a);
}
}
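The simplification claimed in the comment above checks out: folding the identity8 doubling into the tx_height == 32 row shift of 2 gives ((A * 2) + 2) >> 2, and that equals (A + 1) >> 1 for every integer A under arithmetic (floor) shifts, both being floor((A + 1) / 2); hence the single vrshrq_n_s16(v_src, 1). As a one-liner:

#include <cassert>

// Both sides compute floor((a + 1) / 2), assuming arithmetic right shift.
void CheckIdentity8Row32(int a) {
  assert((((a * 2) + 2) >> 2) == ((a + 1) >> 1));
}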
-LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_NEON(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
for (int i = 0; i < 4; ++i) {
- const int16x8_t v_src = vld1q_s16(&src[i * step]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step]);
// For bitdepth == 8, the identity row clamps to a signed 16bit value, so
// saturating add here is ok.
const int16x8_t v_srcx2 = vqaddq_s16(v_src, v_src);
@@ -2030,36 +1964,31 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
- const int16x4_t v_src0 = vdup_n_s16(src[0]);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
const int16x4_t v_src_round =
vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
const int16x4_t v_src = vbsl_s16(v_mask, v_src_round, v_src0);
const int32x4_t v_srcx2 = vaddl_s16(v_src, v_src);
const int32x4_t dst_0 = vqrshlq_s32(v_srcx2, vdupq_n_s32(-row_shift));
- vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
return true;
}
-LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, const void* source,
- int32_t step, int shift) {
+LIBGAV1_ALWAYS_INLINE void Identity16Row_NEON(void* dest, int32_t step,
+ int shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
const int32x4_t v_dual_round = vdupq_n_s32((1 + (shift << 1)) << 11);
const int32x4_t v_shift = vdupq_n_s32(-(12 + shift));
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 2; ++j) {
- const int16x8_t v_src = vld1q_s16(&src[i * step + j * 8]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j * 8]);
const int32x4_t v_src_mult_lo =
vmlal_n_s16(v_dual_round, vget_low_s16(v_src), kIdentity16Multiplier);
const int32x4_t v_src_mult_hi = vmlal_n_s16(
@@ -2072,17 +2001,12 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const int16x4_t v_src0 = vdup_n_s16(src[0]);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
const uint16x4_t v_mask = vdup_n_u16(should_round ? 0xffff : 0);
const int16x4_t v_src_round =
vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
@@ -2093,21 +2017,20 @@
const int32x4_t v_src_mult_lo =
vmlal_s16(v_dual_round, (v_src), v_multiplier);
const int32x4_t dst_0 = vqshlq_s32(v_src_mult_lo, v_shift);
- vst1_lane_s16(&dst[0], vqmovn_s32(dst_0), 0);
+ vst1_lane_s16(dst, vqmovn_s32(dst_0), 0);
return true;
}
-LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest, const void* source,
+LIBGAV1_ALWAYS_INLINE void Identity32Row16_NEON(void* dest,
const int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
// When combining the identity32 multiplier with the row shift, the
// calculation for tx_height equal to 16 can be simplified from
// (((A * 4) + 1) >> 1) to (A * 2).
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 32; j += 8) {
- const int16x8_t v_src = vld1q_s16(&src[i * step + j]);
+ const int16x8_t v_src = vld1q_s16(&dst[i * step + j]);
// For bitdepth == 8, the identity row clamps to a signed 16bit value, so
// saturating add here is ok.
const int16x8_t v_dst_i = vqaddq_s16(v_src, v_src);
@@ -2116,21 +2039,18 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
- int non_zero_coeff_count) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
- const int16x4_t v_src0 = vdup_n_s16(src[0]);
+ const int16x4_t v_src0 = vdup_n_s16(dst[0]);
const int16x4_t v_src = vqrdmulh_n_s16(v_src0, kTransformRowMultiplier << 3);
// When combining the identity32 multiplier with the row shift, the
// calculation for tx_height equal to 16 can be simplified from
// (((A * 4) + 1) >> 1) to (A * 2).
const int16x4_t v_dst_0 = vqadd_s16(v_src, v_src);
- vst1_lane_s16(&dst[0], v_dst_0, 0);
+ vst1_lane_s16(dst, v_dst_0, 0);
return true;
}
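Identity8Row4_NEON and Identity32Row16_NEON both realize their x2 as vqaddq_s16(v_src, v_src); as the neighboring comments note, the 8-bit identity row is specified to clamp to a signed 16-bit value anyway, so a saturating add is a legitimate doubling. Scalar model (a sketch):

#include <algorithm>
#include <cstdint>

// Scalar model of vqadd_s16(x, x): doubling with int16_t saturation.
inline int16_t SaturatingDouble(int16_t x) {
  const int32_t sum = 2 * static_cast<int32_t>(x);
  return static_cast<int16_t>(
      std::clamp<int32_t>(sum, INT16_MIN, INT16_MAX));
}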
@@ -2188,11 +2108,11 @@
// Process 4 wht4 rows and columns.
LIBGAV1_ALWAYS_INLINE void Wht4_NEON(uint8_t* dst, const int dst_stride,
const void* source,
- const int non_zero_coeff_count) {
+ const int adjusted_tx_height) {
const auto* const src = static_cast<const int16_t*>(source);
int16x4_t s[4];
- if (non_zero_coeff_count == 1) {
+ if (adjusted_tx_height == 1) {
// Special case: only src[0] is nonzero.
// src[0] 0 0 0
// 0 0 0 0
@@ -2411,533 +2331,531 @@
}
}
-void Dct4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct4TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = (tx_height == 16);
- if (is_row) {
- const bool should_round = (tx_height == 8);
- const int row_shift = (tx_height == 16);
-
- if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct4 rows in parallel.
- Dct4_NEON<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
- /*transpose=*/true);
- } else {
- // Process 8 1d dct4 rows in parallel per iteration.
- int i = 0;
- do {
- Dct4_NEON<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
- /*step=*/4, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (tx_height == 16) {
- RowShift<4>(src, num_rows, 1);
- }
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_NEON<ButterflyRotation_4, false>(src, /*step=*/4, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct4_NEON<ButterflyRotation_8, true>(data, /*step=*/4,
+ /*transpose=*/true);
+ data += 32;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
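The rewritten row loops also change shape: instead of an index i growing toward num_rows, they count adjusted_tx_height down to zero while bumping a pointer by eight rows' worth of coefficients (32 int16 values above, since dct4 rows are 4 wide). The distilled pattern, with a hypothetical Process8Rows standing in for the Dct*/Adst* calls:

#include <cstdint>

// Countdown/pointer-bump loop shared by the Row transforms in this file.
void RowLoop(int16_t* src, int adjusted_tx_height, int tx_width,
             void (*Process8Rows)(int16_t*)) {
  int i = adjusted_tx_height;  // A multiple of 8 on this path.
  int16_t* data = src;
  do {
    Process8Rows(data);
    data += 8 * tx_width;  // Advance eight rows of tx_width coefficients.
    i -= 8;
  } while (i != 0);
}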
+void Dct4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<4>(src, tx_width);
}
- if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct4 columns in parallel.
- Dct4_NEON<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
- /*transpose=*/false);
+ Dct4_NEON<ButterflyRotation_4, false>(src, tx_width, /*transpose=*/false);
} else {
// Process 8 1d dct4 columns in parallel per iteration.
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
- Dct4_NEON<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
+ Dct4_NEON<ButterflyRotation_8, true>(data, tx_width,
/*transpose=*/false);
- i += 8;
- } while (i < tx_width);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
}
}
+
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<4>(frame, start_x, start_y, tx_width, src, tx_type);
}
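This is the template for the whole section: each Dct*/Adst*/Identity* loop splits into a Row/Column pair, with the row half ignoring tx_type, the frame coordinates, and dst_frame entirely (it can return early on the DC-only path without ever touching the frame), while the column half owns the column flip, the DC-only column path, and the final StoreToFrameWithRound. A sketch of how such a pair might be wired up (the table type and field names here are assumptions, not libgav1's actual dsp-table layout):

// Hypothetical registration of the split entry points.
struct InverseTransformFuncs {
  void (*row)(TransformType tx_type, TransformSize tx_size,
              int adjusted_tx_height, void* src_buffer, int start_x,
              int start_y, void* dst_frame);
  void (*column)(TransformType tx_type, TransformSize tx_size,
                 int adjusted_tx_height, void* src_buffer, int start_x,
                 int start_y, void* dst_frame);
};

void RegisterDct4(InverseTransformFuncs* funcs) {
  funcs->row = Dct4TransformLoopRow_NEON;
  funcs->column = Dct4TransformLoopColumn_NEON;
}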
-void Dct8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct8TransformLoopRow_NEON(TransformType /*tx_type*/, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct8 rows in parallel.
- Dct8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
- /*transpose=*/true);
- } else {
- // Process 8 1d dct8 rows in parallel per iteration.
- int i = 0;
- do {
- Dct8_NEON<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
- /*step=*/8, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (row_shift > 0) {
- RowShift<8>(src, num_rows, row_shift);
- }
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Dct8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct8 columns in parallel.
- Dct8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Dct8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
// Process 8 1d dct8 columns in parallel per iteration.
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
- Dct8_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Dct8_NEON<ButterflyRotation_8, false>(data, tx_width,
/*transpose=*/false);
- i += 8;
- } while (i < tx_width);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<8>(frame, start_x, start_y, tx_width, src, tx_type);
}
-void Dct16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct16 rows in parallel.
- Dct16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 16,
- /*is_row=*/true, row_shift);
- } else {
- int i = 0;
- do {
- // Process 8 1d dct16 rows in parallel per iteration.
- Dct16_NEON<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
- /*is_row=*/true, row_shift);
- i += 8;
- } while (i < num_rows);
- }
-
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Dct16TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct16 columns in parallel.
- Dct16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*is_row=*/false, /*row_shift=*/0);
+ Dct16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
} else {
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
// Process 8 1d dct16 columns in parallel per iteration.
- Dct16_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
- /*is_row=*/false,
+ Dct16_NEON<ButterflyRotation_8, false>(data, tx_width, /*is_row=*/false,
/*row_shift=*/0);
- i += 8;
- } while (i < tx_width);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<16>(frame, start_x, start_y, tx_width, src, tx_type);
}
-void Dct32TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<32>(src, num_rows);
- }
- // Process 8 1d dct32 rows in parallel per iteration.
- int i = 0;
- do {
- Dct32_NEON(&src[i * 32], &src[i * 32], 32, /*is_row=*/true, row_shift);
- i += 8;
- } while (i < num_rows);
-
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
- if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
- // Process 8 1d dct32 columns in parallel per iteration.
- int i = 0;
- do {
- Dct32_NEON(&src[i], &src[i], tx_width, /*is_row=*/false, /*row_shift=*/0);
- i += 8;
- } while (i < tx_width);
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
}
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_NEON(&src[i * 32], 32, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct32TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct32 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct32_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<32>(frame, start_x, start_y, tx_width, src, tx_type);
}
-void Dct64TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct64TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<64>(src, num_rows);
- }
- // Process 8 1d dct64 rows in parallel per iteration.
- int i = 0;
- do {
- Dct64_NEON(&src[i * 64], &src[i * 64], 64, /*is_row=*/true, row_shift);
- i += 8;
- } while (i < num_rows);
-
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
- if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
- // Process 8 1d dct64 columns in parallel per iteration.
- int i = 0;
- do {
- Dct64_NEON(&src[i], &src[i], tx_width, /*is_row=*/false, /*row_shift=*/0);
- i += 8;
- } while (i < tx_width);
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
}
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_NEON(&src[i * 64], 64, /*is_row=*/true, row_shift);
+ i += 8;
+ } while (i < adjusted_tx_height);
+}
+
+void Dct64TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
+ // Process 8 1d dct64 columns in parallel per iteration.
+ int i = tx_width;
+ auto* data = src;
+ do {
+ Dct64_NEON(data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
+ }
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<64>(frame, start_x, start_y, tx_width, src, tx_type);
}
-void Adst4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst4TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
- if (is_row) {
- const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
- const bool should_round = (tx_height == 8);
-
- if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
-
- // Process 4 1d adst4 rows in parallel per iteration.
- int i = 0;
- do {
- Adst4_NEON<false>(&src[i * 4], &src[i * 4], /*step=*/4,
- /*transpose=*/true);
- i += 4;
- } while (i < num_rows);
-
- if (tx_height == 16) {
- RowShift<4>(src, num_rows, 1);
- }
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst4_NEON<false>(data, /*step=*/4, /*transpose=*/true);
+ data += 16;
+ i -= 4;
+ } while (i != 0);
+
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<4>(src, tx_width);
}
- if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
// Process 4 1d adst4 columns in parallel per iteration.
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
- Adst4_NEON<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
- i += 4;
- } while (i < tx_width);
+ Adst4_NEON<false>(data, tx_width, /*transpose=*/false);
+ data += 4;
+ i -= 4;
+ } while (i != 0);
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<4, /*enable_flip_rows=*/true>(frame, start_x, start_y,
tx_width, src, tx_type);
}
-void Adst8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst8TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d adst8 rows in parallel.
- Adst8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
- /*transpose=*/true);
- } else {
- // Process 8 1d adst8 rows in parallel per iteration.
- int i = 0;
- do {
- Adst8_NEON<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
- /*step=*/8,
- /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (row_shift > 0) {
- RowShift<8>(src, num_rows, row_shift);
- }
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_NEON<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ auto* data = src;
+ do {
+ Adst8_NEON<ButterflyRotation_8, false>(data, /*step=*/8,
+ /*transpose=*/true);
+ data += 64;
+ i -= 8;
+ } while (i != 0);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d adst8 columns in parallel.
- Adst8_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Adst8_NEON<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
// Process 8 1d adst8 columns in parallel per iteration.
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
- Adst8_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Adst8_NEON<ButterflyRotation_8, false>(data, tx_width,
/*transpose=*/false);
- i += 8;
- } while (i < tx_width);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<8, /*enable_flip_rows=*/true>(frame, start_x, start_y,
tx_width, src, tx_type);
}
-void Adst16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d adst16 rows in parallel.
- Adst16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 16,
- /*is_row=*/true, row_shift);
- } else {
- int i = 0;
- do {
- // Process 8 1d adst16 rows in parallel per iteration.
- Adst16_NEON<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
- /*is_row=*/true, row_shift);
- i += 8;
- } while (i < num_rows);
- }
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height == 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_NEON<ButterflyRotation_4, true>(src, 16, /*is_row=*/true, row_shift);
+ } else {
+ assert(adjusted_tx_height % 8 == 0);
+ int i = adjusted_tx_height;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_NEON<ButterflyRotation_8, false>(src, 16, /*is_row=*/true,
+ row_shift);
+ src += 128;
+ i -= 8;
+ } while (i != 0);
+ }
+}
+
+void Adst16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d adst16 columns in parallel.
- Adst16_NEON<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*is_row=*/false, /*row_shift=*/0);
+ Adst16_NEON<ButterflyRotation_4, true>(src, 4, /*is_row=*/false,
+ /*row_shift=*/0);
} else {
- int i = 0;
+ int i = tx_width;
+ auto* data = src;
do {
// Process 8 1d adst16 columns in parallel per iteration.
- Adst16_NEON<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
- /*is_row=*/false,
- /*row_shift=*/0);
- i += 8;
- } while (i < tx_width);
+ Adst16_NEON<ButterflyRotation_8, false>(
+ data, tx_width, /*is_row=*/false, /*row_shift=*/0);
+ data += 8;
+ i -= 8;
+ } while (i != 0);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound<16, /*enable_flip_rows=*/true>(frame, start_x, start_y,
tx_width, src, tx_type);
}
-void Identity4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
+void Identity4TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<false>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ } else {
+ int i = adjusted_tx_height;
+ do {
+ Identity4_NEON<true>(src, /*step=*/4);
+ src += 16;
+ i -= 4;
+ } while (i != 0);
+ }
+}
+
+void Identity4TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
- if (is_row) {
- // Special case: Process row calculations during column transform call.
- // Improves performance.
- if (tx_type == kTransformTypeIdentityIdentity &&
- tx_size == kTransformSize4x4) {
- return;
- }
-
- const bool should_round = (tx_height == 8);
-
- if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- tx_height)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
- if (tx_height < 16) {
- int i = 0;
- do {
- Identity4_NEON<false>(&src[i * 4], &src[i * 4], /*step=*/4);
- i += 4;
- } while (i < num_rows);
- } else {
- int i = 0;
- do {
- Identity4_NEON<true>(&src[i * 4], &src[i * 4], /*step=*/4);
- i += 4;
- } while (i < num_rows);
- }
- return;
- }
- assert(!is_row);
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
// Special case: Process row calculations during column transform call.
if (tx_type == kTransformTypeIdentityIdentity &&
(tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
- Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
- src);
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
return;
}
@@ -2945,168 +2863,185 @@
FlipColumns<4>(src, tx_width);
}
- IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width, height, src);
+ IdentityColumnStoreToFrame<4>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
-void Identity8TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
- auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
-
- if (is_row) {
- // Special case: Process row calculations during column transform call.
- // Improves performance.
- if (tx_type == kTransformTypeIdentityIdentity &&
- tx_size == kTransformSize8x4) {
- return;
- }
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- // When combining the identity8 multiplier with the row shift, the
- // calculations for tx_height == 8 and tx_height == 16 can be simplified
- // from ((A * 2) + 1) >> 1) to A.
- if ((tx_height & 0x18) != 0) {
- return;
- }
- if (tx_height == 32) {
- for (int i = 0; i < num_rows; i += 4) {
- Identity8Row32_NEON(&src[i * 8], &src[i * 8], /*step=*/8);
- }
- return;
- }
-
- // Process kTransformSize8x4
- assert(tx_size == kTransformSize8x4);
- for (int i = 0; i < num_rows; i += 4) {
- Identity8Row4_NEON(&src[i * 8], &src[i * 8], /*step=*/8);
- }
+void Identity8TransformLoopRow_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
return;
}
- assert(!is_row);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+  // from (((A * 2) + 1) >> 1) to A.
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row32_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = adjusted_tx_height;
+ do {
+ Identity8Row4_NEON(src, /*step=*/8);
+ src += 32;
+ i -= 4;
+ } while (i != 0);
+}
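The simplification above rests on the identity (((A * 2) + 1) >> 1) == A. A
minimal standalone check, assuming two's-complement integers with an
arithmetic right shift (the test harness is illustrative, not library code):

  #include <cassert>
  #include <cstdint>

  int main() {
    // Exhaustively verify ((A * 2) + 1) >> 1 == A over the int16_t range,
    // i.e. the combined identity8 multiplier and row shift is a no-op.
    for (int a = INT16_MIN; a <= INT16_MAX; ++a) {
      assert(((a * 2 + 1) >> 1) == a);
    }
    return 0;
  }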
+
+void Identity8TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width, height, src);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<8>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
-void Identity16TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Identity16TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
- for (int i = 0; i < num_rows; i += 4) {
- Identity16Row_NEON(&src[i * 16], &src[i * 16], /*step=*/16,
- kTransformRowShift[tx_size]);
- }
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = adjusted_tx_height;
+ do {
+ Identity16Row_NEON(src, /*step=*/16, kTransformRowShift[tx_size]);
+ src += 64;
+ i -= 4;
+ } while (i != 0);
+}
+
+void Identity16TransformLoopColumn_NEON(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width, height,
- src);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ IdentityColumnStoreToFrame<16>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
-void Identity32TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
- auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
+void Identity32TransformLoopRow_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
const int tx_height = kTransformHeight[tx_size];
- if (is_row) {
- // When combining the identity32 multiplier with the row shift, the
- // calculations for tx_height == 8 and tx_height == 32 can be simplified
- // from ((A * 4) + 2) >> 2) to A.
- if ((tx_height & 0x28) != 0) {
- return;
- }
-
- // Process kTransformSize32x16. The src is always rounded before the
- // identity transform and shifted by 1 afterwards.
-
- if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);
-
- assert(tx_size == kTransformSize32x16);
- ApplyRounding<32>(src, num_rows);
- for (int i = 0; i < num_rows; i += 4) {
- Identity32Row16_NEON(&src[i * 32], &src[i * 32], /*step=*/32);
- }
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+  // from (((A * 4) + 2) >> 2) to A.
+ if ((tx_height & 0x28) != 0) {
return;
}
- assert(!is_row);
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width, height,
- src);
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = adjusted_tx_height;
+ do {
+ Identity32Row16_NEON(src, /*step=*/32);
+ src += 128;
+ i -= 4;
+ } while (i != 0);
}
-void Wht4TransformLoop_NEON(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
+void Identity32TransformLoopColumn_NEON(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ IdentityColumnStoreToFrame<32>(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_NEON(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/, void* /*src_buffer*/,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
assert(tx_type == kTransformTypeDctDct);
assert(tx_size == kTransformSize4x4);
static_cast<void>(tx_type);
static_cast<void>(tx_size);
- if (is_row) {
- // Do both row and column transforms in the column-transform pass.
- return;
- }
+ // Do both row and column transforms in the column-transform pass.
+}
- assert(!is_row);
+void Wht4TransformLoopColumn_NEON(TransformType tx_type, TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
// Process 4 1d wht4 rows and columns in parallel.
- auto* src = static_cast<int16_t*>(src_buffer);
+ const auto* src = static_cast<int16_t*>(src_buffer);
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
uint8_t* dst = frame[start_y] + start_x;
const int dst_stride = frame.columns();
- Wht4_NEON(dst, dst_stride, src, non_zero_coeff_count);
+ Wht4_NEON(dst, dst_stride, src, adjusted_tx_height);
}
//------------------------------------------------------------------------------
@@ -3115,38 +3050,64 @@
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
// Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
- Dct4TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
- Dct8TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
- Dct16TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
- Dct32TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
- Dct64TransformLoop_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_NEON;
// Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
- Adst4TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
- Adst8TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
- Adst16TransformLoop_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_NEON;
// Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
- Identity4TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
- Identity8TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
- Identity16TransformLoop_NEON;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
- Identity32TransformLoop_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_NEON;
// Maximum transform size for Wht is 4.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
- Wht4TransformLoop_NEON;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_NEON;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_NEON;
}
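With the registration above, the dispatch table is indexed as
[1-D transform][1-D size][pass]. A hedged sketch of how a caller might fetch
and invoke the split loops, assuming the InverseTransformAddFunc pointer type
from dsp.h; the surrounding driver code is illustrative only:

  const InverseTransformAddFunc row =
      dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow];
  const InverseTransformAddFunc column =
      dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn];
  // The row pass transforms |src_buffer| in place; it ignores the frame.
  row(tx_type, tx_size, adjusted_tx_height, src_buffer, /*start_x=*/0,
      /*start_y=*/0, /*dst_frame=*/nullptr);
  // The column pass consumes |src_buffer| and writes into the frame.
  column(tx_type, tx_size, adjusted_tx_height, src_buffer, start_x, start_y,
         &frame);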
} // namespace
@@ -3156,7 +3117,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/inverse_transform_neon.h b/libgav1/src/dsp/arm/inverse_transform_neon.h
index af647e8..91e0e83 100644
--- a/libgav1/src/dsp/arm/inverse_transform_neon.h
+++ b/libgav1/src/dsp/arm/inverse_transform_neon.h
@@ -26,6 +26,7 @@
// Initializes Dsp::inverse_transforms, see the defines below for specifics.
// This function is not thread-safe.
void InverseTransformInit_NEON();
+void InverseTransformInit10bpp_NEON();
} // namespace dsp
} // namespace libgav1
@@ -47,6 +48,21 @@
#define LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity LIBGAV1_CPU_NEON
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity LIBGAV1_CPU_NEON
+
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_INVERSE_TRANSFORM_NEON_H_
diff --git a/libgav1/src/dsp/arm/loop_filter_neon.cc b/libgav1/src/dsp/arm/loop_filter_neon.cc
index 146c983..8d72892 100644
--- a/libgav1/src/dsp/arm/loop_filter_neon.cc
+++ b/libgav1/src/dsp/arm/loop_filter_neon.cc
@@ -35,7 +35,7 @@
// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
inline uint8x8_t Hev(const uint8x8_t abd_p0p1_q0q1, const uint8_t thresh) {
const uint8x8_t a = vcgt_u8(abd_p0p1_q0q1, vdup_n_u8(thresh));
- return vorr_u8(a, RightShift<32>(a));
+ return vorr_u8(a, RightShiftVector<32>(a));
}
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
@@ -44,7 +44,7 @@
const uint8x8x2_t a = Interleave32(p0q0, p1q1);
const uint8x8_t b = vabd_u8(a.val[0], a.val[1]);
const uint8x8_t p0q0_double = vqadd_u8(b, b);
- const uint8x8_t p1q1_half = RightShift<32>(vshr_n_u8(b, 1));
+ const uint8x8_t p1q1_half = RightShiftVector<32>(vshr_n_u8(b, 1));
const uint8x8_t c = vqadd_u8(p0q0_double, p1q1_half);
return vcle_u8(c, vdup_n_u8(outer_thresh));
}
@@ -56,7 +56,7 @@
const uint8_t inner_thresh,
const uint8_t outer_thresh) {
const uint8x8_t a = vcle_u8(abd_p0p1_q0q1, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(a, RightShift<32>(a));
+ const uint8x8_t inner_mask = vand_u8(a, RightShiftVector<32>(a));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -121,7 +121,7 @@
vcombine_s16(vget_low_s16(p0q1_l), vget_low_s16(q0p1_l));
// Need to shift the second term or we end up with a2_ma2.
const int8x8_t a2_ma1 =
- InterleaveLow32(a2_a1, RightShift<32>(vneg_s8(a2_a1)));
+ InterleaveLow32(a2_a1, RightShiftVector<32>(vneg_s8(a2_a1)));
const int16x8_t p0q0_a = vaddw_s8(p0q0_l, a2_ma1);
*p1q1_result = vqmovun_s16(p1q1_a3);
@@ -251,7 +251,7 @@
const uint8x8_t abd_p0p2_q0q2) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p0p2_q0q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(1));
- return vand_u8(b, RightShift<32>(b));
+ return vand_u8(b, RightShiftVector<32>(b));
}
// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
@@ -264,7 +264,7 @@
const uint8_t outer_thresh) {
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vcle_u8(a, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(b, RightShift<32>(b));
+ const uint8x8_t inner_mask = vand_u8(b, RightShiftVector<32>(b));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -482,7 +482,7 @@
const uint8x8_t a = vmax_u8(abd_p0n0_q0n0, abd_p0n1_q0n1);
const uint8x8_t b = vmax_u8(a, abd_p0n2_q0n2);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(1));
- return vand_u8(c, RightShift<32>(c));
+ return vand_u8(c, RightShiftVector<32>(c));
}
// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
@@ -498,7 +498,7 @@
const uint8x8_t a = vmax_u8(abd_p0p1_q0q1, abd_p1p2_q1q2);
const uint8x8_t b = vmax_u8(a, abd_p2p3_q2q3);
const uint8x8_t c = vcle_u8(b, vdup_n_u8(inner_thresh));
- const uint8x8_t inner_mask = vand_u8(c, RightShift<32>(c));
+ const uint8x8_t inner_mask = vand_u8(c, RightShiftVector<32>(c));
const uint8x8_t outer_mask = OuterThreshold(p0q0, p1q1, outer_thresh);
return vand_u8(inner_mask, outer_mask);
}
@@ -1179,7 +1179,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/loop_restoration_neon.cc b/libgav1/src/dsp/arm/loop_restoration_neon.cc
index 1e8dfb2..e6ceb66 100644
--- a/libgav1/src/dsp/arm/loop_restoration_neon.cc
+++ b/libgav1/src/dsp/arm/loop_restoration_neon.cc
@@ -18,6 +18,7 @@
#if LIBGAV1_ENABLE_NEON
#include <arm_neon.h>
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -40,10 +41,25 @@
}
template <int bytes>
+inline uint8x8_t VshrU128(const uint8x8_t src[2]) {
+ return vext_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
+inline uint8x16_t VshrU128(const uint8x16_t src[2]) {
+ return vextq_u8(src[0], src[1], bytes);
+}
+
+template <int bytes>
inline uint16x8_t VshrU128(const uint16x8x2_t src) {
return vextq_u16(src.val[0], src.val[1], bytes / 2);
}
+template <int bytes>
+inline uint16x8_t VshrU128(const uint16x8_t src[2]) {
+ return vextq_u16(src[0], src[1], bytes / 2);
+}
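All of the VshrU128 overloads implement the same operation: treat the two
input vectors as one contiguous 256-bit window and return the 128 bits that
start |bytes| into it. A scalar model of the semantics (illustrative):

  #include <cstdint>
  #include <cstring>

  // Concatenate src[0] and src[1], then copy out the 16-byte window that
  // begins |bytes| in -- the scalar equivalent of
  // vextq_u8(src[0], src[1], bytes).
  template <int bytes>
  void VshrU128Model(const uint8_t src[2][16], uint8_t dst[16]) {
    uint8_t concat[32];
    std::memcpy(concat + 0, src[0], 16);
    std::memcpy(concat + 16, src[1], 16);
    std::memcpy(dst, concat + bytes, 16);
  }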
+
// Wiener
// Must make a local copy of coefficients to help compiler know that they have
@@ -115,8 +131,7 @@
const ptrdiff_t width, const int height,
const int16_t filter[4],
int16_t** const wiener_buffer) {
- int y = height;
- do {
+ for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
uint8x16_t s[8];
s[0] = vld1q_u8(src_ptr);
@@ -140,15 +155,14 @@
x -= 16;
} while (x != 0);
src += src_stride;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
const int16_t filter[4],
int16_t** const wiener_buffer) {
- int y = height;
- do {
+ for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
uint8x16_t s[6];
s[0] = vld1q_u8(src_ptr);
@@ -169,40 +183,37 @@
x -= 16;
} while (x != 0);
src += src_stride;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
const int16_t filter[4],
int16_t** const wiener_buffer) {
- int y = height;
- do {
+ for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
- uint8x16_t s[4];
- s[0] = vld1q_u8(src_ptr);
+ uint8x16_t s[3];
ptrdiff_t x = width;
do {
- src_ptr += 16;
- s[3] = vld1q_u8(src_ptr);
- s[1] = vextq_u8(s[0], s[3], 1);
- s[2] = vextq_u8(s[0], s[3], 2);
+ // Slightly faster than using vextq_u8().
+ s[0] = vld1q_u8(src_ptr);
+ s[1] = vld1q_u8(src_ptr + 1);
+ s[2] = vld1q_u8(src_ptr + 2);
int16x8x2_t sum;
sum.val[0] = sum.val[1] = vdupq_n_s16(0);
WienerHorizontalSum(s, filter, sum, *wiener_buffer);
- s[0] = s[3];
+ src_ptr += 16;
*wiener_buffer += 16;
x -= 16;
} while (x != 0);
src += src_stride;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
int16_t** const wiener_buffer) {
- int y = height;
- do {
+ for (int y = height; y != 0; --y) {
const uint8_t* src_ptr = src;
ptrdiff_t x = width;
do {
@@ -218,7 +229,7 @@
x -= 16;
} while (x != 0);
src += src_stride;
- } while (--y != 0);
+ }
}
inline int32x4x2_t WienerVertical2(const int16x8_t a0, const int16x8_t a1,
@@ -479,19 +490,19 @@
// For width 16 and up, store the horizontal results, and then do the vertical
// filter row by row. This is faster than doing it column by column when
// considering cache issues.
-void WienerFilter_NEON(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- const ptrdiff_t source_stride,
- const ptrdiff_t dest_stride, const int width,
- const int height, RestorationBuffer* const buffer) {
- constexpr int kCenterTap = kWienerFilterTaps / 2;
+void WienerFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
1);
const ptrdiff_t wiener_stride = Align(width, 16);
- int16_t* const wiener_buffer_vertical = buffer->wiener_buffer;
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
// The values are saturated to 13 bits before storing.
int16_t* wiener_buffer_horizontal =
wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
@@ -506,25 +517,48 @@
// Over-reads up to 15 - |kRestorationHorizontalBorder| values.
const int height_horizontal =
height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
- const auto* const src = static_cast<const uint8_t*>(source) -
- (kCenterTap - number_rows_to_skip) * source_stride;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(src - 3, source_stride, wiener_stride,
- height_horizontal, filter_horizontal,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(src - 2, source_stride, wiener_stride,
- height_horizontal, filter_horizontal,
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(src - 1, source_stride, wiener_stride,
- height_horizontal, filter_horizontal,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal,
&wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(src, source_stride, wiener_stride, height_horizontal,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -536,52 +570,41 @@
// the top and bottom row of |wiener_buffer| accordingly.
memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
sizeof(*wiener_buffer_horizontal) * wiener_stride);
- memcpy(buffer->wiener_buffer, buffer->wiener_buffer + wiener_stride,
- sizeof(*buffer->wiener_buffer) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
- filter_vertical, dst, dest_stride);
+ filter_vertical, dst, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
- height, filter_vertical, dst, dest_stride);
+ height, filter_vertical, dst, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
- wiener_stride, height, filter_vertical, dst,
- dest_stride);
+ wiener_stride, height, filter_vertical, dst, stride);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
- wiener_stride, height, dst, dest_stride);
+ wiener_stride, height, dst, stride);
}
}
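Each horizontal tap above now runs three times: over |height_extra| rows of
the top border, over the |height| body rows, and over |height_extra| rows of
the bottom border. A quick compile-time check of the row accounting, assuming
kWienerFilterTaps == 7 as declared elsewhere in the library (illustrative):

  constexpr int kTapsCheck = 7;  // kWienerFilterTaps
  // height_horizontal = height + kTaps - 1 - 2 * number_rows_to_skip, so
  // with height = 32 and one skipped row each border contributes two rows.
  static_assert((32 + kTapsCheck - 1 - 2 * 1 - 32) / 2 == 2,
                "height_extra is 2 when one vertical tap is skipped");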
//------------------------------------------------------------------------------
// SGR
-template <int n>
-inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
- const uint32_t scale) {
- // a = |sum_sq|
- // d = |sum|
- // p = (a * n < d * d) ? 0 : a * n - d * d;
- const uint32x4_t dxd = vmull_u16(sum, sum);
- const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
- // Ensure |p| does not underflow by using saturating subtraction.
- const uint32x4_t p = vqsubq_u32(axn, dxd);
- // z = RightShiftWithRounding(p * scale, kSgrProjScaleBits);
- const uint32x4_t pxs = vmulq_n_u32(p, scale);
- // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits
- // is 20.
- const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
- return vmovn_u32(shifted);
-}
-
-inline void Prepare3_8(const uint8x8x2_t src, uint8x8_t dst[3]) {
+inline void Prepare3_8(const uint8x8_t src[2], uint8x8_t dst[3]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
}
-inline void Prepare3_16(const uint16x8x2_t src, uint16x4_t low[3],
+template <int offset>
+inline void Prepare3_8(const uint8x16_t src[2], uint8x16_t dst[3]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+}
+
+inline void Prepare3_16(const uint16x8_t src[2], uint16x4_t low[3],
uint16x4_t high[3]) {
uint16x8_t s[3];
s[0] = VshrU128<0>(src);
@@ -595,7 +618,7 @@
high[2] = vget_high_u16(s[2]);
}
-inline void Prepare5_8(const uint8x8x2_t src, uint8x8_t dst[5]) {
+inline void Prepare5_8(const uint8x8_t src[2], uint8x8_t dst[5]) {
dst[0] = VshrU128<0>(src);
dst[1] = VshrU128<1>(src);
dst[2] = VshrU128<2>(src);
@@ -603,7 +626,16 @@
dst[4] = VshrU128<4>(src);
}
-inline void Prepare5_16(const uint16x8x2_t src, uint16x4_t low[5],
+template <int offset>
+inline void Prepare5_8(const uint8x16_t src[2], uint8x16_t dst[5]) {
+ dst[0] = VshrU128<offset + 0>(src);
+ dst[1] = VshrU128<offset + 1>(src);
+ dst[2] = VshrU128<offset + 2>(src);
+ dst[3] = VshrU128<offset + 3>(src);
+ dst[4] = VshrU128<offset + 4>(src);
+}
+
+inline void Prepare5_16(const uint16x8_t src[2], uint16x4_t low[5],
uint16x4_t high[5]) {
Prepare3_16(src, low, high);
const uint16x8_t s3 = VshrU128<6>(src);
@@ -642,6 +674,30 @@
return vaddw_u8(sum, src[2]);
}
+inline uint16x8_t Sum3WLo16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ return vaddw_u8(sum, vget_low_u8(src[2]));
+}
+
+inline uint16x8_t Sum3WHi16(const uint8x16_t src[3]) {
+ const uint16x8_t sum = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ return vaddw_u8(sum, vget_high_u8(src[2]));
+}
+
+inline uint16x8_t Sum5WLo16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_low_u8(src[4]));
+}
+
+inline uint16x8_t Sum5WHi16(const uint8x16_t src[5]) {
+ const uint16x8_t sum01 = vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[1]));
+ const uint16x8_t sum23 = vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[3]));
+ const uint16x8_t sum = vaddq_u16(sum01, sum23);
+ return vaddw_u8(sum, vget_high_u8(src[4]));
+}
+
inline uint32x4_t Sum3W_32(const uint16x4_t src[3]) {
const uint32x4_t sum = vaddl_u16(src[0], src[1]);
return vaddw_u16(sum, src[2]);
@@ -679,13 +735,28 @@
return vaddw_u16(sum0123, src[4]);
}
-inline uint16x8_t Sum3Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[3];
Prepare3_8(src, s);
return Sum3W_16(s);
}
-inline uint32x4x2_t Sum3WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum3Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum3Horizontal(s);
+}
+
+template <int offset>
+inline void Sum3Horizontal(const uint8x16_t src[2], uint16x8_t dst[2]) {
+ uint8x16_t s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline uint32x4x2_t Sum3WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t sum;
Prepare3_16(src, low, high);
@@ -694,7 +765,7 @@
return sum;
}
-inline uint16x8_t Sum5Horizontal(const uint8x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x8_t src[2]) {
uint8x8_t s[5];
Prepare5_8(src, s);
const uint16x8_t sum01 = vaddl_u8(s[0], s[1]);
@@ -703,7 +774,23 @@
return vaddw_u8(sum0123, s[4]);
}
-inline uint32x4x2_t Sum5WHorizontal(const uint16x8x2_t src) {
+inline uint16x8_t Sum5Horizontal(const uint8x16_t src) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return Sum5Horizontal(s);
+}
+
+template <int offset>
+inline void Sum5Horizontal(const uint8x16_t src[2], uint16x8_t* const dst0,
+ uint16x8_t* const dst1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline uint32x4x2_t Sum5WHorizontal(const uint16x8_t src[2]) {
uint16x4_t low[5], high[5];
Prepare5_16(src, low, high);
uint32x4x2_t sum;
@@ -712,6 +799,30 @@
return sum;
}
+template <int offset>
+void SumHorizontal(const uint8x16_t src[2], uint16x8_t* const row3_0,
+ uint16x8_t* const row3_1, uint16x8_t* const row5_0,
+ uint16x8_t* const row5_1) {
+ uint8x16_t s[5];
+ Prepare5_8<offset>(src, s);
+ const uint16x8_t sum04_lo = vaddl_u8(vget_low_u8(s[0]), vget_low_u8(s[4]));
+ const uint16x8_t sum04_hi = vaddl_u8(vget_high_u8(s[0]), vget_high_u8(s[4]));
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = vaddq_u16(sum04_lo, *row3_0);
+ *row5_1 = vaddq_u16(sum04_hi, *row3_1);
+}
+
+void SumHorizontal(const uint8x8_t src[2], uint16x8_t* const row3,
+ uint16x8_t* const row5) {
+ uint8x8_t s[5];
+ Prepare5_8(src, s);
+ const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
+ const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
+ *row3 = vaddw_u8(sum12, s[3]);
+ *row5 = vaddq_u16(sum04, *row3);
+}
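Both SumHorizontal overloads exploit the same observation: the 5-tap row sum
can reuse the 3-tap sum of the middle elements, saving two additions per
output. A scalar model (illustrative):

  #include <cstdint>

  inline void SumHorizontalModel(const uint16_t s[5], uint16_t* row3,
                                 uint16_t* row5) {
    *row3 = s[1] + s[2] + s[3];   // 3-tap box sum
    *row5 = *row3 + s[0] + s[4];  // 5-tap sum reuses the 3-tap sum
  }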
+
void SumHorizontal(const uint16x4_t src[5], uint32x4_t* const row_sq3,
uint32x4_t* const row_sq5) {
const uint32x4_t sum04 = vaddl_u16(src[0], src[4]);
@@ -720,27 +831,36 @@
*row_sq5 = vaddq_u32(sum04, *row_sq3);
}
-void SumHorizontal(const uint8x8x2_t src, const uint16x8x2_t sq,
- uint16x8_t* const row3, uint16x8_t* const row5,
- uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
- uint8x8_t s[5];
- Prepare5_8(src, s);
- const uint16x8_t sum04 = vaddl_u8(s[0], s[4]);
- const uint16x8_t sum12 = vaddl_u8(s[1], s[2]);
- *row3 = vaddw_u8(sum12, s[3]);
- *row5 = vaddq_u16(sum04, *row3);
+void SumHorizontal(const uint16x8_t sq[2], uint32x4x2_t* const row_sq3,
+ uint32x4x2_t* const row_sq5) {
uint16x4_t low[5], high[5];
Prepare5_16(sq, low, high);
SumHorizontal(low, &row_sq3->val[0], &row_sq5->val[0]);
SumHorizontal(high, &row_sq3->val[1], &row_sq5->val[1]);
}
-inline uint16x8_t Sum343(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+void SumHorizontal(const uint8x8_t src[2], const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ SumHorizontal(src, row3, row5);
+ SumHorizontal(sq, row_sq3, row_sq5);
+}
+
+void SumHorizontal(const uint8x16_t src, const uint16x8_t sq[2],
+ uint16x8_t* const row3, uint16x8_t* const row5,
+ uint32x4x2_t* const row_sq3, uint32x4x2_t* const row_sq5) {
+ uint8x8_t s[2];
+ s[0] = vget_low_u8(src);
+ s[1] = vget_high_u8(src);
+ return SumHorizontal(s, sq, row3, row5, row_sq3, row_sq5);
+}
+
+template <int offset>
+inline uint16x8_t Sum343(const uint8x16_t ma3[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
const uint16x8_t sum3 = Sum3_16(sum, sum, sum);
- return vaddw_u8(sum3, s[1]);
+ return vaddw_u8(sum3,
+ (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
}
inline uint32x4_t Sum343W(const uint16x4_t src[3]) {
@@ -749,7 +869,7 @@
return vaddw_u16(sum3, src[1]);
}
-inline uint32x4x2_t Sum343W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum343W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -758,13 +878,13 @@
return d;
}
-inline uint16x8_t Sum565(const uint8x8x2_t src) {
- uint8x8_t s[3];
- Prepare3_8(src, s);
- const uint16x8_t sum = Sum3W_16(s);
+template <int offset>
+inline uint16x8_t Sum565(const uint8x16_t ma5[2]) {
+ const uint16x8_t sum = (offset == 0) ? Sum3WLo16(ma5) : Sum3WHi16(ma5);
const uint16x8_t sum4 = vshlq_n_u16(sum, 2);
const uint16x8_t sum5 = vaddq_u16(sum4, sum);
- return vaddw_u8(sum5, s[1]);
+ return vaddw_u8(sum5,
+ (offset == 0) ? vget_low_u8(ma5[1]) : vget_high_u8(ma5[1]));
}
inline uint32x4_t Sum565W(const uint16x4_t src[3]) {
@@ -774,7 +894,7 @@
return vaddw_u16(sum5, src[1]);
}
-inline uint32x4x2_t Sum565W(const uint16x8x2_t src) {
+inline uint32x4x2_t Sum565W(const uint16x8_t src[2]) {
uint16x4_t low[3], high[3];
uint32x4x2_t d;
Prepare3_16(src, low, high);
@@ -783,19 +903,203 @@
return d;
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t sum_stride, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+  // Don't widen the loop to 16; the 16-wide version is even slower.
+ do {
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row3, row5;
+ uint32x4x2_t row_sq3, row_sq5;
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
+ SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
+ vst1q_u16(sum3, row3);
+ vst1q_u16(sum5, row5);
+ vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
+ vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
+ vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
+ vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
+ s[0] = s[1];
+ sq[0] = sq[1];
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t sum_stride, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int y = 2;
+  // Don't widen the loop to 16; the 16-wide version is even slower.
+ do {
+ uint8x8_t s[2];
+ uint16x8_t sq[2];
+ s[0] = vld1_u8(src);
+ sq[0] = vmull_u8(s[0], s[0]);
+ ptrdiff_t x = 0;
+ do {
+ uint16x8_t row;
+ uint32x4x2_t row_sq;
+ s[1] = vld1_u8(src + x + 8);
+ sq[1] = vmull_u8(s[1], s[1]);
+ if (size == 3) {
+ row = Sum3Horizontal(s);
+ row_sq = Sum3WHorizontal(sq);
+ } else {
+ row = Sum5Horizontal(s);
+ row_sq = Sum5WHorizontal(sq);
+ }
+ vst1q_u16(sums, row);
+ vst1q_u32(square_sums + 0, row_sq.val[0]);
+ vst1q_u32(square_sums + 4, row_sq.val[1]);
+ s[0] = s[1];
+ sq[0] = sq[1];
+ sums += 8;
+ square_sums += 8;
+ x += 8;
+ } while (x < sum_stride);
+ src += src_stride;
+ } while (--y != 0);
+}
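The BoxSum loops produce 8 outputs per iteration while loading only 8 new
pixels, since s[0]/sq[0] are carried across iterations. A scalar model of one
output row, with |size| 3 or 5 as in the template (illustrative):

  #include <cstdint>

  inline void BoxSumRowModel(const uint8_t* src, int width, int size,
                             uint16_t* sums, uint32_t* square_sums) {
    for (int x = 0; x < width; ++x) {
      uint16_t sum = 0;
      uint32_t sum_sq = 0;
      for (int k = 0; k < size; ++k) {
        sum += src[x + k];
        sum_sq += static_cast<uint32_t>(src[x + k]) * src[x + k];
      }
      sums[x] = sum;
      square_sums[x] = sum_sq;
    }
  }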
+
+template <int n>
+inline uint16x4_t CalculateMa(const uint16x4_t sum, const uint32x4_t sum_sq,
+ const uint32_t scale) {
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const uint32x4_t dxd = vmull_u16(sum, sum);
+ const uint32x4_t axn = vmulq_n_u32(sum_sq, n);
+ // Ensure |p| does not underflow by using saturating subtraction.
+ const uint32x4_t p = vqsubq_u32(axn, dxd);
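+  // z = RightShiftWithRounding(p * scale, kSgrProjScaleBits);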
+ const uint32x4_t pxs = vmulq_n_u32(p, scale);
+ // vrshrn_n_u32() (narrowing shift) can only shift by 16 and kSgrProjScaleBits
+ // is 20.
+ const uint32x4_t shifted = vrshrq_n_u32(pxs, kSgrProjScaleBits);
+ return vmovn_u32(shifted);
+}
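In scalar terms CalculateMa computes n times the unnormalized box variance,
clamped at zero, then scales and rounds it. A model under the same
definitions, with kSgrProjScaleBits == 20 as noted above (illustrative):

  #include <cstdint>

  inline uint16_t CalculateMaModel(uint16_t sum, uint32_t sum_sq,
                                   uint32_t scale, int n) {
    const uint64_t dxd = static_cast<uint64_t>(sum) * sum;
    const uint64_t axn = static_cast<uint64_t>(sum_sq) * n;
    const uint64_t p = (axn < dxd) ? 0 : axn - dxd;  // saturating subtract
    // RightShiftWithRounding(p * scale, 20), done in one step here.
    return static_cast<uint16_t>((p * scale + (UINT64_C(1) << 19)) >> 20);
  }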
+
+inline uint8x8_t AdjustValue(const uint8x8_t value, const uint8x8_t index,
+ const int threshold) {
+ const uint8x8_t thresholds = vdup_n_u8(threshold);
+ const uint8x8_t offset = vcgt_u8(index, thresholds);
+ // Adding 255 is equivalent to subtracting 1 for 8-bit data.
+ return vadd_u8(value, offset);
+}
+
+template <int n, int offset>
+inline void CalculateIntermediate(const uint16x8_t sum,
+ const uint32x4x2_t sum_sq,
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
+ const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
+ const uint16x4_t z1 =
+ CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
+ const uint16x8_t z01 = vcombine_u16(z0, z1);
+ const uint8x8_t idx = vqmovn_u16(z01);
+ // Use table lookup to read elements whose indices are less than 48.
+ // Using one uint8x8x4_t vector and one uint8x8x2_t vector is faster than
+ // using two uint8x8x3_t vectors.
+ uint8x8x4_t table0;
+ uint8x8x2_t table1;
+ table0.val[0] = vld1_u8(kSgrMaLookup + 0 * 8);
+ table0.val[1] = vld1_u8(kSgrMaLookup + 1 * 8);
+ table0.val[2] = vld1_u8(kSgrMaLookup + 2 * 8);
+ table0.val[3] = vld1_u8(kSgrMaLookup + 3 * 8);
+ table1.val[0] = vld1_u8(kSgrMaLookup + 4 * 8);
+ table1.val[1] = vld1_u8(kSgrMaLookup + 5 * 8);
+  // All elements whose indices are outside the range [0, 47] are set to 0.
+ uint8x8_t val = vtbl4_u8(table0, idx); // Range [0, 31].
+ // Subtract 8 to shuffle the next index range.
+ const uint8x8_t index = vsub_u8(idx, vdup_n_u8(32));
+ const uint8x8_t res = vtbl2_u8(table1, index); // Range [32, 47].
+  // OR the two shuffle results together.
+ val = vorr_u8(val, res);
+
+  // For indices larger than 47 the table values change only rarely as the
+  // index grows, so they are reconstructed with comparisons and arithmetic
+  // instead of more lookups.
+  // Lanes with indices larger than 47 (still 0 after the lookup) start at 5.
+ val = vmax_u8(val, vdup_n_u8(5));
+  val = AdjustValue(val, idx, 55);   // 55 is the last index whose value is 5.
+  val = AdjustValue(val, idx, 72);   // 72 is the last index whose value is 4.
+  val = AdjustValue(val, idx, 101);  // 101 is the last index whose value is 3.
+  val = AdjustValue(val, idx, 169);  // 169 is the last index whose value is 2.
+  val = AdjustValue(val, idx, 254);  // 254 is the last index whose value is 1.
+ *ma = (offset == 0) ? vcombine_u8(val, vget_high_u8(*ma))
+ : vcombine_u8(vget_low_u8(*ma), val);
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const uint16x8_t maq =
+ vmovl_u8((offset == 0) ? vget_low_u8(*ma) : vget_high_u8(*ma));
+ const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
+ const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
+ const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
+ const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
+ const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
+ const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
+ *b = vcombine_u16(b_lo, b_hi);
+}
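The two-stage lookup reduces to a simple scalar rule: indices 0..47 read
kSgrMaLookup directly, and larger indices fall in long flat runs whose value
can be recovered by counting exceeded thresholds. A scalar model using the
thresholds from the comments above (illustrative):

  inline uint8_t SgrMaModel(uint8_t idx) {
    if (idx < 48) return kSgrMaLookup[idx];  // direct table read
    uint8_t val = 5;       // value for indices 48..55
    if (idx > 55) --val;   // 4 for 56..72
    if (idx > 72) --val;   // 3 for 73..101
    if (idx > 101) --val;  // 2 for 102..169
    if (idx > 169) --val;  // 1 for 170..254
    if (idx > 254) --val;  // 0 for 255
    return val;
  }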
+
+template <int offset>
+inline void CalculateIntermediate5(const uint16x8_t s5[5],
+ const uint32x4x2_t sq5[5],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum5_16(s5);
+ const uint32x4x2_t sum_sq = Sum5_32(sq5);
+ CalculateIntermediate<25, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void CalculateIntermediate3(const uint16x8_t s3[3],
+ const uint32x4x2_t sq3[3],
+ const uint32_t scale, uint8x16_t* const ma,
+ uint16x8_t* const b) {
+ const uint16x8_t sum = Sum3_16(s3);
+ const uint32x4x2_t sum_sq = Sum3_32(sq3);
+ CalculateIntermediate<9, offset>(sum, sum_sq, scale, ma, b);
+}
+
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint16x8_t* const sum_ma444,
uint32x4x2_t* const sum_b343,
uint32x4x2_t* const sum_b444, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
- uint8x8_t s[3];
- Prepare3_8(ma3, s);
- const uint16x8_t sum_ma111 = Sum3W_16(s);
+ const uint16x8_t sum_ma111 = (offset == 0) ? Sum3WLo16(ma3) : Sum3WHi16(ma3);
*sum_ma444 = vshlq_n_u16(sum_ma111, 2);
const uint16x8_t sum333 = vsubq_u16(*sum_ma444, sum_ma111);
- *sum_ma343 = vaddw_u8(sum333, s[1]);
+ *sum_ma343 = vaddw_u8(
+ sum333, (offset == 0) ? vget_low_u8(ma3[1]) : vget_high_u8(ma3[1]));
uint16x4_t low[3], high[3];
uint32x4x2_t sum_b111;
Prepare3_16(b3, low, high);
@@ -809,30 +1113,608 @@
sum_b343->val[1] = vaddw_u16(sum_b343->val[1], high[1]);
vst1q_u16(ma343 + x, *sum_ma343);
vst1q_u16(ma444 + x, *sum_ma444);
- vst1q_u32(b343 + x + 0, (*sum_b343).val[0]);
- vst1q_u32(b343 + x + 4, (*sum_b343).val[1]);
- vst1q_u32(b444 + x + 0, (*sum_b444).val[0]);
- vst1q_u32(b444 + x + 4, (*sum_b444).val[1]);
+ vst1q_u32(b343 + x + 0, sum_b343->val[0]);
+ vst1q_u32(b343 + x + 4, sum_b343->val[1]);
+ vst1q_u32(b444 + x + 0, sum_b444->val[0]);
+ vst1q_u32(b444 + x + 4, sum_b444->val[1]);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16x8_t* const sum_ma343,
uint32x4x2_t* const sum_b343, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma444;
uint32x4x2_t sum_b444;
- Store343_444(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444, ma343,
- ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, &sum_b444,
+ ma343, ma444, b343, b444);
}
-inline void Store343_444(const uint8x8x2_t ma3, const uint16x8x2_t b3,
+template <int offset>
+inline void Store343_444(const uint8x16_t ma3[3], const uint16x8_t b3[2],
const ptrdiff_t x, uint16_t* const ma343,
uint16_t* const ma444, uint32_t* const b343,
uint32_t* const b444) {
uint16x8_t sum_ma343;
uint32x4x2_t sum_b343;
- Store343_444(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343, b444);
+ Store343_444<offset>(ma3, b3, x, &sum_ma343, &sum_b343, ma343, ma444, b343,
+ b444);
+}
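The Store343_444 arithmetic follows from the 1-1-1 sum of three neighboring
ma values: with s111 = a + b + c, the code above stores ma444 = 4 * s111 and
ma343 = 3 * s111 + b (weights 3-4-3). A scalar model (illustrative):

  #include <cstdint>

  inline void Weights343_444(uint16_t a, uint16_t b, uint16_t c,
                             uint16_t* ma343, uint16_t* ma444) {
    const uint16_t s111 = a + b + c;
    *ma444 = s111 << 2;            // 4a + 4b + 4c
    *ma343 = (*ma444 - s111) + b;  // 3a + 4b + 3c
  }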
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const uint8_t* const src0, const uint8_t* const src1, const uint32_t scale,
+ uint8x16_t s[2][2], uint16_t* const sum5[5], uint32_t* const square_sum5[5],
+ uint16x8_t sq[2][4], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ s5[3] = Sum5Horizontal(s[0][0]);
+ s5[4] = Sum5Horizontal(s[1][0]);
+ sq5[3] = Sum5WHorizontal(sq[0]);
+ sq5[4] = Sum5WHorizontal(sq[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint32_t scale, uint8x16_t s[2][2], uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ sq5[3] = Sum5WHorizontal(sq[0] + 1);
+ sq5[4] = Sum5WHorizontal(sq[1] + 1);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ sq5[3] = Sum5WHorizontal(sq[0] + 2);
+ sq5[4] = Sum5WHorizontal(sq[1] + 2);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ uint16x8_t sq[2], uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s5[5];
+ uint32x4x2_t sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s5[3] = s5[4] = Sum5Horizontal(*s);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ uint8x16_t s[2], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], uint16x8_t sq[3], uint8x16_t ma[2],
+ uint16x8_t b[2]) {
+ uint16x8_t s5[2][5];
+ uint32x4x2_t sq5[5];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq5[3] = sq5[4] = Sum5WHorizontal(sq + 1);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[1]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const uint8_t* const src, const uint32_t scale, uint8x16_t* const s,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16x8_t sq[2],
+ uint8x16_t* const ma, uint16x8_t* const b) {
+ uint16x8_t s3[3];
+ uint32x4x2_t sq3[3];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ s3[2] = Sum3Horizontal(*s);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint8x16_t s[2],
+ uint16x8_t sq[3], uint8x16_t ma[2], uint16x8_t b[2]) {
+ uint16x8_t s3[4];
+ uint32x4x2_t sq3[3];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ Sum3Horizontal<8>(s, s3 + 2);
+ sq3[2] = Sum3WHorizontal(sq);
+ vst1q_u16(sum3[2] + x, s3[2]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ s3[0] = vld1q_u16(sum3[0] + x);
+ s3[1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3<8>(s3, sq3, scale, &ma[0], &b[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sq3[2] = Sum3WHorizontal(sq + 1);
+ vst1q_u16(sum3[2] + x + 8, s3[3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ s3[1] = vld1q_u16(sum3[0] + x + 8);
+ s3[2] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3 + 1, sq3, scale, &ma[1], &b[1]);
+}
+
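+// Combined variant for when both sgr passes run: SumHorizontal() produces
+// the 3-tap and 5-tap sums (and squared sums) for two source rows in one
+// sweep, feeding CalculateIntermediate3() and CalculateIntermediate5().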
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t* const ma5, uint16x8_t* const b5) {
+ uint16x8_t s3[4], s5[5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0][0] = vld1q_u8(src0);
+ s[1][0] = vld1q_u8(src1);
+ sq[0][0] = vmull_u8(vget_low_u8(s[0][0]), vget_low_u8(s[0][0]));
+ sq[1][0] = vmull_u8(vget_low_u8(s[1][0]), vget_low_u8(s[1][0]));
+ sq[0][1] = vmull_u8(vget_high_u8(s[0][0]), vget_high_u8(s[0][0]));
+ sq[1][1] = vmull_u8(vget_high_u8(s[1][0]), vget_high_u8(s[1][0]));
+ SumHorizontal(s[0][0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ SumHorizontal(s[1][0], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2], s3[2]);
+ vst1q_u16(sum3[3], s3[3]);
+ vst1q_u32(square_sum3[2] + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3], s5[3]);
+ vst1q_u16(sum5[4], s5[4]);
+ vst1q_u32(square_sum5[3] + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + 4, sq5[4].val[1]);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3[0], b3[0]);
+ CalculateIntermediate3<0>(s3 + 1, sq3 + 1, scales[1], ma3[1], b3[1]);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const ptrdiff_t x,
+ const uint16_t scales[2], uint8x16_t s[2][2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], uint16x8_t sq[2][4], uint8x16_t ma3[2][2],
+ uint16x8_t b3[2][3], uint8x16_t ma5[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][4], s5[2][5];
+ uint32x4x2_t sq3[4], sq5[5];
+ s[0][1] = vld1q_u8(src0 + x + 8);
+ s[1][1] = vld1q_u8(src1 + x + 8);
+ sq[0][2] = vmull_u8(vget_low_u8(s[0][1]), vget_low_u8(s[0][1]));
+ sq[1][2] = vmull_u8(vget_low_u8(s[1][1]), vget_low_u8(s[1][1]));
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ SumHorizontal(sq[0] + 1, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x, s3[0][2]);
+ vst1q_u16(sum3[3] + x, s3[0][3]);
+ vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x, s5[0][3]);
+ vst1q_u16(sum5[4] + x, s5[0][4]);
+ vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0][0], &b3[0][1]);
+ CalculateIntermediate3<8>(s3[0] + 1, sq3 + 1, scales[1], &ma3[1][0],
+ &b3[1][1]);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+
+ sq[0][3] = vmull_u8(vget_high_u8(s[0][1]), vget_high_u8(s[0][1]));
+ sq[1][3] = vmull_u8(vget_high_u8(s[1][1]), vget_high_u8(s[1][1]));
+ SumHorizontal(sq[0] + 2, &sq3[2], &sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3], &sq5[4]);
+ vst1q_u16(sum3[2] + x + 8, s3[1][2]);
+ vst1q_u16(sum3[3] + x + 8, s3[1][3]);
+ vst1q_u32(square_sum3[2] + x + 8, sq3[2].val[0]);
+ vst1q_u32(square_sum3[2] + x + 12, sq3[2].val[1]);
+ vst1q_u32(square_sum3[3] + x + 8, sq3[3].val[0]);
+ vst1q_u32(square_sum3[3] + x + 12, sq3[3].val[1]);
+ vst1q_u16(sum5[3] + x + 8, s5[1][3]);
+ vst1q_u16(sum5[4] + x + 8, s5[1][4]);
+ vst1q_u32(square_sum5[3] + x + 8, sq5[3].val[0]);
+ vst1q_u32(square_sum5[3] + x + 12, sq5[3].val[1]);
+ vst1q_u32(square_sum5[4] + x + 8, sq5[4].val[0]);
+ vst1q_u32(square_sum5[4] + x + 12, sq5[4].val[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[0][1], &b3[0][2]);
+ CalculateIntermediate3<0>(s3[1] + 1, sq3 + 1, scales[1], &ma3[1][1],
+ &b3[1][2]);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+}
+
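+// Last-row counterpart of the combined variant; as in the single-pass case,
+// row 3 of the 5-tap sums is duplicated into row 4 to replicate the bottom
+// border.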
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const uint8_t* const src, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x16_t* const s, uint16x8_t sq[2], uint8x16_t* const ma3,
+ uint8x16_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
+ uint16x8_t s3[3], s5[5];
+ uint32x4x2_t sq3[3], sq5[5];
+ *s = vld1q_u8(src);
+ sq[0] = vmull_u8(vget_low_u8(*s), vget_low_u8(*s));
+ sq[1] = vmull_u8(vget_high_u8(*s), vget_high_u8(*s));
+ SumHorizontal(*s, sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
+ s5[0] = vld1q_u16(sum5[0]);
+ s5[1] = vld1q_u16(sum5[1]);
+ s5[2] = vld1q_u16(sum5[2]);
+ s5[4] = s5[3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ s3[0] = vld1q_u16(sum3[0]);
+ s3[1] = vld1q_u16(sum3[1]);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + 4);
+ CalculateIntermediate3<0>(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ uint8x16_t s[2], uint16x8_t sq[3], uint8x16_t ma3[2], uint8x16_t ma5[2],
+ uint16x8_t b3[2], uint16x8_t b5[2]) {
+ uint16x8_t s3[2][3], s5[2][5];
+ uint32x4x2_t sq3[3], sq5[5];
+ s[1] = vld1q_u8(src + x + 8);
+ sq[1] = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq, &sq3[2], &sq5[3]);
+ s5[0][0] = vld1q_u16(sum5[0] + x);
+ s5[0][1] = vld1q_u16(sum5[1] + x);
+ s5[0][2] = vld1q_u16(sum5[2] + x);
+ s5[0][4] = s5[0][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[0]);
+ s3[0][0] = vld1q_u16(sum3[0] + x);
+ s3[0][1] = vld1q_u16(sum3[1] + x);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
+ CalculateIntermediate3<8>(s3[0], sq3, scales[1], &ma3[0], &b3[0]);
+
+ sq[2] = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ SumHorizontal(sq + 1, &sq3[2], &sq5[3]);
+ s5[1][0] = vld1q_u16(sum5[0] + x + 8);
+ s5[1][1] = vld1q_u16(sum5[1] + x + 8);
+ s5[1][2] = vld1q_u16(sum5[2] + x + 8);
+ s5[1][4] = s5[1][3];
+ sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 8);
+ sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 12);
+ sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 8);
+ sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 12);
+ sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 8);
+ sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 12);
+ sq5[4] = sq5[3];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[1]);
+ s3[1][0] = vld1q_u16(sum3[0] + x + 8);
+ s3[1][1] = vld1q_u16(sum3[1] + x + 8);
+ sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 8);
+ sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 12);
+ sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 8);
+ sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 12);
+ CalculateIntermediate3<0>(s3[1], sq3, scales[1], &ma3[1], &b3[1]);
+}
+
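+// The filter loops below keep a one-vector sliding window: at the end of
+// each iteration the newest source/square/ma/b vectors are moved into the
+// slots read first on the next iteration (s[i][0] = s[i][1],
+// sq[i][1] = sq[i][3], mas[0] = mas[1], bs[0] = bs[2]), providing the
+// overlap that the 565/444/343 windows require.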
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ uint16_t* ma565, uint32_t* b565) {
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t masx[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[0] = Sum565<0>(masx);
+ b[0] = Sum565W(bs);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
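+// calculate444 selects between the two uses of this 3x3 pre-process: when
+// set, Store343_444() writes both the 343- and 444-weighted rows; otherwise
+// only the 343-weighted row is computed and stored.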
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
+ uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
+
+ int x = 0;
+ do {
+ uint8x16_t ma3x[3];
+ BoxFilterPreProcess3(src, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ if (calculate444) {
+ Store343_444<0>(ma3x, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444<8>(ma3x, bs + 1, 0, ma343 + 8, ma444 + 8, b343 + 8,
+ b444 + 8);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ uint16x8_t ma[2];
+ uint32x4x2_t b[2];
+ ma[0] = Sum343<0>(ma3x);
+ b[0] = Sum343W(bs);
+ vst1q_u16(ma343, ma[0]);
+ vst1q_u32(b343 + 0, b[0].val[0]);
+ vst1q_u32(b343 + 4, b[0].val[1]);
+ ma[1] = Sum343<8>(ma3x);
+ b[1] = Sum343W(bs + 1);
+ vst1q_u16(ma343 + 8, ma[1]);
+ vst1q_u32(b343 + 8, b[1].val[0]);
+ vst1q_u32(b343 + 12, b[1].val[1]);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343[4], uint16_t* const ma444, uint16_t* ma565,
+ uint32_t* const b343[4], uint32_t* const b444, uint32_t* b565) {
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ uint16x8_t ma[2];
+ uint8x16_t ma3x[3], ma5x[3];
+ uint32x4x2_t b[2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343<0>(ma3x);
+ ma[1] = Sum343<8>(ma3x);
+ b[0] = Sum343W(b3[0] + 0);
+ b[1] = Sum343W(b3[0] + 1);
+ vst1q_u16(ma343[0] + x, ma[0]);
+ vst1q_u16(ma343[0] + x + 8, ma[1]);
+ vst1q_u32(b343[0] + x, b[0].val[0]);
+ vst1q_u32(b343[0] + x + 4, b[0].val[1]);
+ vst1q_u32(b343[0] + x + 8, b[1].val[0]);
+ vst1q_u32(b343[0] + x + 12, b[1].val[1]);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444<0>(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444<8>(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565<0>(ma5x);
+ ma[1] = Sum565<8>(ma5x);
+ b[0] = Sum565W(b5);
+ b[1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565, ma[0]);
+ vst1q_u16(ma565 + 8, ma[1]);
+ vst1q_u32(b565 + 0, b[0].val[0]);
+ vst1q_u32(b565 + 4, b[0].val[1]);
+ vst1q_u32(b565 + 8, b[1].val[0]);
+ vst1q_u32(b565 + 12, b[1].val[1]);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ ma5[0] = ma5[1];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
}
template <int shift>
@@ -879,734 +1761,382 @@
return CalculateFilteredOutput<5>(s, ma_sum, b_sum);
}
-inline void SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2],
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedFinal(const uint8x8_t src, const int32x4_t v[2]) {
const int16x4_t v_lo =
vrshrn_n_s32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x4_t v_hi =
vrshrn_n_s32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const int16x8_t vv = vcombine_s16(v_lo, v_hi);
- const int16x8_t s = ZeroExtend(src);
- const int16x8_t d = vaddq_s16(s, vv);
- vst1_u8(dst, vqmovun_s16(d));
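+ // vaddw_u8() widens |src| and adds it to |vv| in a single instruction; the
+ // u16<->s16 reinterprets are free, and modular addition produces the same
+ // bits as the signed add it replaces.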
+ const int16x8_t d =
+ vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(vv), src));
+ return vqmovun_s16(d);
}
-inline void SelfGuidedDoubleMultiplier(const uint8x8_t src,
- const int16x8_t filter[2], const int w0,
- const int w2, uint8_t* const dst) {
+inline uint8x8_t SelfGuidedDoubleMultiplier(const uint8x8_t src,
+ const int16x8_t filter[2],
+ const int w0, const int w2) {
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter[0]), w0);
v[1] = vmull_n_s16(vget_high_s16(filter[0]), w0);
v[0] = vmlal_n_s16(v[0], vget_low_s16(filter[1]), w2);
v[1] = vmlal_n_s16(v[1], vget_high_s16(filter[1]), w2);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void SelfGuidedSingleMultiplier(const uint8x8_t src,
- const int16x8_t filter, const int w0,
- uint8_t* const dst) {
+inline uint8x8_t SelfGuidedSingleMultiplier(const uint8x8_t src,
+ const int16x8_t filter,
+ const int w0) {
// weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
int32x4_t v[2];
v[0] = vmull_n_s16(vget_low_s16(filter), w0);
v[1] = vmull_n_s16(vget_high_s16(filter), w0);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t width, uint16_t* sum3,
- uint16_t* sum5, uint32_t* square_sum3,
- uint32_t* square_sum5) {
- int y = height;
- do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- ptrdiff_t x = 0;
- do {
- uint16x8_t row3, row5;
- uint32x4x2_t row_sq3, row_sq5;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
- SumHorizontal(s, sq, &row3, &row5, &row_sq3, &row_sq5);
- vst1q_u16(sum3, row3);
- vst1q_u16(sum5, row5);
- vst1q_u32(square_sum3 + 0, row_sq3.val[0]);
- vst1q_u32(square_sum3 + 4, row_sq3.val[1]);
- vst1q_u32(square_sum5 + 0, row_sq5.val[0]);
- vst1q_u32(square_sum5 + 4, row_sq5.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- sum3 += 8;
- sum5 += 8;
- square_sum3 += 8;
- square_sum5 += 8;
- x += 8;
- } while (x < width);
- src += src_stride;
- } while (--y != 0);
-}
-
-template <int size>
-inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
- const int height, const ptrdiff_t width, uint16_t* sums,
- uint32_t* square_sums) {
- static_assert(size == 3 || size == 5, "");
- int y = height;
- do {
- uint8x8x2_t s;
- uint16x8x2_t sq;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- ptrdiff_t x = 0;
- do {
- uint16x8_t row;
- uint32x4x2_t row_sq;
- s.val[1] = vld1_u8(src + x + 8);
- sq.val[1] = vmull_u8(s.val[1], s.val[1]);
- if (size == 3) {
- row = Sum3Horizontal(s);
- row_sq = Sum3WHorizontal(sq);
- } else {
- row = Sum5Horizontal(s);
- row_sq = Sum5WHorizontal(sq);
- }
- vst1q_u16(sums, row);
- vst1q_u32(square_sums + 0, row_sq.val[0]);
- vst1q_u32(square_sums + 4, row_sq.val[1]);
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- sums += 8;
- square_sums += 8;
- x += 8;
- } while (x < width);
- src += src_stride;
- } while (--y != 0);
-}
-
-template <int n>
-inline void CalculateIntermediate(const uint16x8_t sum,
- const uint32x4x2_t sum_sq,
- const uint32_t scale, uint8x8_t* const ma,
- uint16x8_t* const b) {
- constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- const uint16x4_t z0 = CalculateMa<n>(vget_low_u16(sum), sum_sq.val[0], scale);
- const uint16x4_t z1 =
- CalculateMa<n>(vget_high_u16(sum), sum_sq.val[1], scale);
- const uint16x8_t z01 = vcombine_u16(z0, z1);
- // Using vqmovn_u16() needs an extra sign extension instruction.
- const uint16x8_t z = vminq_u16(z01, vdupq_n_u16(255));
- // Using vgetq_lane_s16() can save the sign extension instruction.
- const uint8_t lookup[8] = {
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 0)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 1)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 2)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 3)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 4)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 5)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 6)],
- kSgrMaLookup[vgetq_lane_s16(vreinterpretq_s16_u16(z), 7)]};
- *ma = vld1_u8(lookup);
- // b = ma * b * one_over_n
- // |ma| = [0, 255]
- // |sum| is a box sum with radius 1 or 2.
- // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
- // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
- // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
- // When radius is 2 |n| is 25. |one_over_n| is 164.
- // When radius is 1 |n| is 9. |one_over_n| is 455.
- // |kSgrProjReciprocalBits| is 12.
- // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
- // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
- const uint16x8_t maq = vmovl_u8(*ma);
- const uint32x4_t m0 = vmull_u16(vget_low_u16(maq), vget_low_u16(sum));
- const uint32x4_t m1 = vmull_u16(vget_high_u16(maq), vget_high_u16(sum));
- const uint32x4_t m2 = vmulq_n_u32(m0, one_over_n);
- const uint32x4_t m3 = vmulq_n_u32(m1, one_over_n);
- const uint16x4_t b_lo = vrshrn_n_u32(m2, kSgrProjReciprocalBits);
- const uint16x4_t b_hi = vrshrn_n_u32(m3, kSgrProjReciprocalBits);
- *b = vcombine_u16(b_lo, b_hi);
-}
-
-inline void CalculateIntermediate5(const uint16x8_t s5[5],
- const uint32x4x2_t sq5[5],
- const uint32_t scale, uint8x8_t* const ma,
- uint16x8_t* const b) {
- const uint16x8_t sum = Sum5_16(s5);
- const uint32x4x2_t sum_sq = Sum5_32(sq5);
- CalculateIntermediate<25>(sum, sum_sq, scale, ma, b);
-}
-
-inline void CalculateIntermediate3(const uint16x8_t s3[3],
- const uint32x4x2_t sq3[3],
- const uint32_t scale, uint8x8_t* const ma,
- uint16x8_t* const b) {
- const uint16x8_t sum = Sum3_16(s3);
- const uint32x4x2_t sum_sq = Sum3_32(sq3);
- CalculateIntermediate<9>(sum, sum_sq, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
- const uint8_t* const src, const ptrdiff_t src_stride, const ptrdiff_t x,
- const uint32_t scale, uint8x8x2_t s[2], uint16x8x2_t sq[2],
- uint16_t* const sum5[5], uint32_t* const square_sum5[5],
- uint8x8_t* const ma, uint16x8_t* const b) {
- uint16x8_t s5[5];
- uint32x4x2_t sq5[5];
- s[0].val[1] = vld1_u8(src + x + 8);
- s[1].val[1] = vld1_u8(src + src_stride + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- s5[3] = Sum5Horizontal(s[0]);
- s5[4] = Sum5Horizontal(s[1]);
- sq5[3] = Sum5WHorizontal(sq[0]);
- sq5[4] = Sum5WHorizontal(sq[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
- vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
- const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], uint8x8_t* const ma, uint16x8_t* const b) {
- uint16x8_t s5[5];
- uint32x4x2_t sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s5[3] = s5[4] = Sum5Horizontal(*s);
- sq5[3] = sq5[4] = Sum5WHorizontal(*sq);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate5(s5, sq5, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
- const uint8_t* const src, const ptrdiff_t x, const uint32_t scale,
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint16_t* const sum3[3],
- uint32_t* const square_sum3[3], uint8x8_t* const ma, uint16x8_t* const b) {
- uint16x8_t s3[3];
- uint32x4x2_t sq3[3];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- s3[2] = Sum3Horizontal(*s);
- sq3[2] = Sum3WHorizontal(*sq);
- vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scale, ma, b);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
- const uint8_t* const src, const ptrdiff_t src_stride, const ptrdiff_t x,
- const uint16_t scales[2], uint8x8x2_t s[2], uint16x8x2_t sq[2],
- uint16_t* const sum3[4], uint16_t* const sum5[5],
- uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint8x8_t* const ma3_0, uint8x8_t* const ma3_1, uint16x8_t* const b3_0,
- uint16x8_t* const b3_1, uint8x8_t* const ma5, uint16x8_t* const b5) {
- uint16x8_t s3[4], s5[5];
- uint32x4x2_t sq3[4], sq5[5];
- s[0].val[1] = vld1_u8(src + x + 8);
- s[1].val[1] = vld1_u8(src + src_stride + x + 8);
- sq[0].val[1] = vmull_u8(s[0].val[1], s[0].val[1]);
- sq[1].val[1] = vmull_u8(s[1].val[1], s[1].val[1]);
- SumHorizontal(s[0], sq[0], &s3[2], &s5[3], &sq3[2], &sq5[3]);
- SumHorizontal(s[1], sq[1], &s3[3], &s5[4], &sq3[3], &sq5[4]);
- vst1q_u16(sum3[2] + x, s3[2]);
- vst1q_u16(sum3[3] + x, s3[3]);
- vst1q_u32(square_sum3[2] + x + 0, sq3[2].val[0]);
- vst1q_u32(square_sum3[2] + x + 4, sq3[2].val[1]);
- vst1q_u32(square_sum3[3] + x + 0, sq3[3].val[0]);
- vst1q_u32(square_sum3[3] + x + 4, sq3[3].val[1]);
- vst1q_u16(sum5[3] + x, s5[3]);
- vst1q_u16(sum5[4] + x, s5[4]);
- vst1q_u32(square_sum5[3] + x + 0, sq5[3].val[0]);
- vst1q_u32(square_sum5[3] + x + 4, sq5[3].val[1]);
- vst1q_u32(square_sum5[4] + x + 0, sq5[4].val[0]);
- vst1q_u32(square_sum5[4] + x + 4, sq5[4].val[1]);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3_0, b3_0);
- CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], ma3_1, b3_1);
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
-}
-
-LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
- const uint8_t* const src, const ptrdiff_t x, const uint16_t scales[2],
- const uint16_t* const sum3[4], const uint16_t* const sum5[5],
- const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
- uint8x8x2_t* const s, uint16x8x2_t* const sq, uint8x8_t* const ma3,
- uint8x8_t* const ma5, uint16x8_t* const b3, uint16x8_t* const b5) {
- uint16x8_t s3[3], s5[5];
- uint32x4x2_t sq3[3], sq5[5];
- s->val[1] = vld1_u8(src + x + 8);
- sq->val[1] = vmull_u8(s->val[1], s->val[1]);
- SumHorizontal(*s, *sq, &s3[2], &s5[3], &sq3[2], &sq5[3]);
- s5[0] = vld1q_u16(sum5[0] + x);
- s5[1] = vld1q_u16(sum5[1] + x);
- s5[2] = vld1q_u16(sum5[2] + x);
- s5[4] = s5[3];
- sq5[0].val[0] = vld1q_u32(square_sum5[0] + x + 0);
- sq5[0].val[1] = vld1q_u32(square_sum5[0] + x + 4);
- sq5[1].val[0] = vld1q_u32(square_sum5[1] + x + 0);
- sq5[1].val[1] = vld1q_u32(square_sum5[1] + x + 4);
- sq5[2].val[0] = vld1q_u32(square_sum5[2] + x + 0);
- sq5[2].val[1] = vld1q_u32(square_sum5[2] + x + 4);
- sq5[4] = sq5[3];
- CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
- s3[0] = vld1q_u16(sum3[0] + x);
- s3[1] = vld1q_u16(sum3[1] + x);
- sq3[0].val[0] = vld1q_u32(square_sum3[0] + x + 0);
- sq3[0].val[1] = vld1q_u32(square_sum3[0] + x + 4);
- sq3[1].val[0] = vld1q_u32(square_sum3[1] + x + 0);
- sq3[1].val[1] = vld1q_u32(square_sum3[1] + x + 4);
- CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
-}
-
-inline void BoxSumFilterPreProcess5(const uint8_t* const src,
- const ptrdiff_t src_stride, const int width,
- const uint32_t scale,
- uint16_t* const sum5[5],
- uint32_t* const square_sum5[5],
- uint16_t* ma565, uint32_t* b565) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- s[1].val[0] = vld1_u8(src + src_stride);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src, src_stride, 0, scale, s, sq, sum5, square_sum5,
- &mas.val[0], &bs.val[0]);
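+// Pass-1-only filter: consumes two source rows per call and 16 pixels per
+// loop iteration, producing two output rows.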
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const uint32_t scale,
+ const int16_t w0, uint16_t* const ma565[2], uint32_t* const b565[2],
+ uint8_t* const dst) {
+ uint8x16_t s[2][2], mas[2];
+ uint16x8_t sq[2][4], bs[3];
+ BoxFilterPreProcess5Lo(src0, src1, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src, src_stride, x + 8, scale, s, sq, sum5,
- square_sum5, &mas.val[1], &bs.val[1]);
- const uint16x8_t ma = Sum565(mas);
- const uint32x4x2_t b = Sum565W(bs);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
- } while (x < width);
-}
-
-template <bool calculate444>
-LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
- const uint8_t* const src, const int width, const uint32_t scale,
- uint16_t* const sum3[3], uint32_t* const square_sum3[3], uint16_t* ma343,
- uint16_t* ma444, uint32_t* b343, uint32_t* b444) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src, 0, scale, &s, &sq, sum3, square_sum3, &mas.val[0],
- &bs.val[0]);
-
- int x = 0;
- do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src, x + 8, scale, &s, &sq, sum3, square_sum3,
- &mas.val[1], &bs.val[1]);
- if (calculate444) {
- Store343_444(mas, bs, 0, ma343, ma444, b343, b444);
- ma444 += 8;
- b444 += 8;
- } else {
- const uint16x8_t ma = Sum343(mas);
- const uint32x4x2_t b = Sum343W(bs);
- vst1q_u16(ma343, ma);
- vst1q_u32(b343 + 0, b.val[0]);
- vst1q_u32(b343 + 4, b.val[1]);
- }
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- ma343 += 8;
- b343 += 8;
- x += 8;
- } while (x < width);
-}
-
-inline void BoxSumFilterPreProcess(
- const uint8_t* const src, const ptrdiff_t src_stride, const int width,
- const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
- uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
- uint16_t* const ma343[4], uint16_t* const ma444[2], uint16_t* ma565,
- uint32_t* const b343[4], uint32_t* const b444[2], uint32_t* b565) {
- uint8x8x2_t s[2];
- uint8x8x2_t ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src + 0);
- s[1].val[0] = vld1_u8(src + src_stride + 0);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src, src_stride, 0, scales, s, sq, sum3, sum5,
- square_sum3, square_sum5, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
-
- int x = 0;
- do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src, src_stride, x + 8, scales, s, sq, sum3, sum5,
- square_sum3, square_sum5, &ma3[0].val[1],
- &ma3[1].val[1], &b3[0].val[1], &b3[1].val[1],
- &ma5.val[1], &b5.val[1]);
- uint16x8_t ma = Sum343(ma3[0]);
- uint32x4x2_t b = Sum343W(b3[0]);
- vst1q_u16(ma343[0] + x, ma);
- vst1q_u32(b343[0] + x, b.val[0]);
- vst1q_u32(b343[0] + x + 4, b.val[1]);
- Store343_444(ma3[1], b3[1], x, ma343[1], ma444[0], b343[1], b444[0]);
- ma = Sum565(ma5);
- b = Sum565W(b5);
- vst1q_u16(ma565, ma);
- vst1q_u32(b565 + 0, b.val[0]);
- vst1q_u32(b565 + 4, b.val[1]);
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma565 += 8;
- b565 += 8;
- x += 8;
- } while (x < width);
-}
-
-inline void BoxFilterPass1(const uint8_t* const src0, const uint8_t* const src,
- const ptrdiff_t src_stride, uint16_t* const sum5[5],
- uint32_t* const square_sum5[5], const int width,
- const uint32_t scale, const int16_t w0,
- uint16_t* const ma565[2], uint32_t* const b565[2],
- uint8_t* const dst, const ptrdiff_t dst_stride) {
- uint8x8x2_t s[2], mas;
- uint16x8x2_t sq[2], bs;
- s[0].val[0] = vld1_u8(src);
- s[1].val[0] = vld1_u8(src + src_stride);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess5(src, src_stride, 0, scale, s, sq, sum5, square_sum5,
- &mas.val[0], &bs.val[0]);
-
- int x = 0;
- do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess5(src, src_stride, x + 8, scale, s, sq, sum5,
- square_sum5, &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ int16x8_t p0, p1;
+ BoxFilterPreProcess5(src0, src1, x + 8, scale, s, sum5, square_sum5, sq,
+ mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
vst1q_u16(ma565[1] + x, ma[1]);
vst1q_u32(b565[1] + x + 0, b[1].val[0]);
vst1q_u32(b565[1] + x + 4, b[1].val[1]);
- const uint8x8_t s0 = vld1_u8(src0 + x);
- const uint8x8_t s1 = vld1_u8(src0 + src_stride + x);
- int16x8_t p0, p1;
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
ma[0] = vld1q_u16(ma565[0] + x);
b[0].val[0] = vld1q_u32(b565[0] + x + 0);
b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p0 = CalculateFilteredOutputPass1(s0, ma, b);
- p1 = CalculateFilteredOutput<4>(s1, ma[1], b[1]);
- SelfGuidedSingleMultiplier(s0, p0, w0, dst + x);
- SelfGuidedSingleMultiplier(s1, p1, w0, dst + dst_stride + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ p0 = CalculateFilteredOutputPass1(sr00, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr10, ma[1], b[1]);
+ const uint8x8_t d00 = SelfGuidedSingleMultiplier(sr00, p0, w0);
+ const uint8x8_t d10 = SelfGuidedSingleMultiplier(sr10, p1, w0);
+
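+ // Second 8-pixel half: the same steps with offsets shifted by 8; the two
+ // 8-wide results are recombined with vcombine_u8() for a single 16-byte
+ // store.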
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[1]);
+ vst1q_u32(b565[1] + x + 8, b[1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[1].val[1]);
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0] = vld1q_u16(ma565[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p0 = CalculateFilteredOutputPass1(sr01, ma, b);
+ p1 = CalculateFilteredOutput<4>(sr11, ma[1], b[1]);
+ const uint8x8_t d01 = SelfGuidedSingleMultiplier(sr01, p0, w0);
+ const uint8x8_t d11 = SelfGuidedSingleMultiplier(sr11, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
-inline void BoxFilterPass1LastRow(const uint8_t* const src0,
- const uint8_t* const src, const int width,
+inline void BoxFilterPass1LastRow(const uint8_t* const src,
+ const uint8_t* const src0, const int width,
const uint32_t scale, const int16_t w0,
uint16_t* const sum5[5],
uint32_t* const square_sum5[5],
uint16_t* ma565, uint32_t* b565,
uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess5LastRow(src, 0, scale, &s, &sq, sum5, square_sum5,
- &mas.val[0], &bs.val[0]);
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[4];
+ BoxFilterPreProcess5LastRowLo(src0, scale, s, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess5LastRow(src, x + 8, scale, &s, &sq, sum5, square_sum5,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[2];
+ uint8x16_t masx[3];
uint32x4x2_t b[2];
- ma[1] = Sum565(mas);
+ BoxFilterPreProcess5LastRow(src0, x + 8, scale, s, sum5, square_sum5,
+ sq + 1, mas, bs + 1);
+ Prepare3_8<0>(mas, masx);
+ ma[1] = Sum565<0>(masx);
b[1] = Sum565W(bs);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
ma[0] = vld1q_u16(ma565);
b[0].val[0] = vld1q_u32(b565 + 0);
b[0].val[1] = vld1q_u32(b565 + 4);
- const uint8x8_t s = vld1_u8(src0 + x);
- const int16x8_t p = CalculateFilteredOutputPass1(s, ma, b);
- SelfGuidedSingleMultiplier(s, p, w0, dst + x);
- ma565 += 8;
- b565 += 8;
- x += 8;
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ const int16x8_t p0 = CalculateFilteredOutputPass1(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ ma[1] = Sum565<8>(masx);
+ b[1] = Sum565W(bs + 1);
+ bs[0] = bs[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + 8);
+ b[0].val[0] = vld1q_u32(b565 + 8);
+ b[0].val[1] = vld1q_u32(b565 + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass1(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
-inline void BoxFilterPass2(const uint8_t* const src0, const uint8_t* const src,
- const int width, const uint32_t scale,
- const int16_t w0, uint16_t* const sum3[3],
- uint32_t* const square_sum3[3],
- uint16_t* const ma343[3], uint16_t* const ma444[2],
- uint32_t* const b343[3], uint32_t* const b444[2],
- uint8_t* const dst) {
- uint8x8x2_t s, mas;
- uint16x8x2_t sq, bs;
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcess3(src, 0, scale, &s, &sq, sum3, square_sum3, &mas.val[0],
- &bs.val[0]);
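+// Pass-2-only filter: reads the 343- and 444-weighted rows produced by
+// earlier iterations (ma343[0], ma444[0]) while Store343_444() emits the
+// rows needed by the following ones.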
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint32_t scale, const int16_t w0, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], uint16_t* const ma343[3],
+ uint16_t* const ma444[2], uint32_t* const b343[3], uint32_t* const b444[2],
+ uint8_t* const dst) {
+ uint8x16_t s[2], mas[2];
+ uint16x8_t sq[4], bs[3];
+ BoxFilterPreProcess3Lo(src0, scale, &s[0], sum3, square_sum3, sq, &mas[0],
+ &bs[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcess3(src, x + 8, scale, &s, &sq, sum3, square_sum3,
- &mas.val[1], &bs.val[1]);
uint16x8_t ma[3];
+ uint8x16_t ma3x[3];
uint32x4x2_t b[3];
- Store343_444(mas, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
- b444[1]);
- const uint8x8_t s0 = vld1_u8(src0 + x);
+ BoxFilterPreProcess3(src0, x + 8, scale, sum3, square_sum3, s, sq + 1, mas,
+ bs + 1);
+ Prepare3_8<0>(mas, ma3x);
+ Store343_444<0>(ma3x, bs, x, &ma[2], &b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
ma[0] = vld1q_u16(ma343[0] + x);
ma[1] = vld1q_u16(ma444[0] + x);
b[0].val[0] = vld1q_u32(b343[0] + x + 0);
b[0].val[1] = vld1q_u32(b343[0] + x + 4);
b[1].val[0] = vld1q_u32(b444[0] + x + 0);
b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- const int16x8_t p = CalculateFilteredOutputPass2(s0, ma, b);
- SelfGuidedSingleMultiplier(s0, p, w0, dst + x);
- mas.val[0] = mas.val[1];
- bs.val[0] = bs.val[1];
- x += 8;
+ const int16x8_t p0 = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedSingleMultiplier(sr0, p0, w0);
+
+ Store343_444<8>(ma3x, bs + 1, x + 8, &ma[2], &b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1] = vld1q_u16(ma444[0] + x + 8);
+ b[0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1].val[1] = vld1q_u32(b444[0] + x + 12);
+ const int16x8_t p1 = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedSingleMultiplier(sr1, p1, w0);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
-inline void BoxFilter(const uint8_t* const src0, const uint8_t* const src,
- const ptrdiff_t src_stride, const int width,
- const uint16_t scales[2], const int16_t w0,
- const int16_t w2, uint16_t* const sum3[4],
- uint16_t* const sum5[5], uint32_t* const square_sum3[4],
- uint32_t* const square_sum5[5], uint16_t* const ma343[4],
- uint16_t* const ma444[3], uint16_t* const ma565[2],
- uint32_t* const b343[4], uint32_t* const b444[3],
- uint32_t* const b565[2], uint8_t* const dst,
- const ptrdiff_t dst_stride) {
- uint8x8x2_t s[2], ma3[2], ma5;
- uint16x8x2_t sq[2], b3[2], b5;
- s[0].val[0] = vld1_u8(src);
- s[1].val[0] = vld1_u8(src + src_stride);
- sq[0].val[0] = vmull_u8(s[0].val[0], s[0].val[0]);
- sq[1].val[0] = vmull_u8(s[1].val[0], s[1].val[0]);
- BoxFilterPreProcess(src, src_stride, 0, scales, s, sq, sum3, sum5,
- square_sum3, square_sum5, &ma3[0].val[0], &ma3[1].val[0],
- &b3[0].val[0], &b3[1].val[0], &ma5.val[0], &b5.val[0]);
-
- int x = 0;
- do {
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- BoxFilterPreProcess(src, src_stride, x + 8, scales, s, sq, sum3, sum5,
- square_sum3, square_sum5, &ma3[0].val[1],
- &ma3[1].val[1], &b3[0].val[1], &b3[1].val[1],
- &ma5.val[1], &b5.val[1]);
- uint16x8_t ma[3][3];
- uint32x4x2_t b[3][3];
- Store343_444(ma3[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
- ma343[2], ma444[1], b343[2], b444[1]);
- Store343_444(ma3[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
- b343[3], b444[2]);
- ma[0][1] = Sum565(ma5);
- b[0][1] = Sum565W(b5);
- vst1q_u16(ma565[1] + x, ma[0][1]);
- vst1q_u32(b565[1] + x, b[0][1].val[0]);
- vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
- s[0].val[0] = s[0].val[1];
- s[1].val[0] = s[1].val[1];
- sq[0].val[0] = sq[0].val[1];
- sq[1].val[0] = sq[1].val[1];
- ma3[0].val[0] = ma3[0].val[1];
- ma3[1].val[0] = ma3[1].val[1];
- b3[0].val[0] = b3[0].val[1];
- b3[1].val[0] = b3[1].val[1];
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- int16x8_t p[2][2];
- const uint8x8_t s0 = vld1_u8(src0 + x);
- const uint8x8_t s1 = vld1_u8(src0 + src_stride + x);
- ma[0][0] = vld1q_u16(ma565[0] + x);
- b[0][0].val[0] = vld1q_u32(b565[0] + x);
- b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0][0] = CalculateFilteredOutputPass1(s0, ma[0], b[0]);
- p[1][0] = CalculateFilteredOutput<4>(s1, ma[0][1], b[0][1]);
- ma[1][0] = vld1q_u16(ma343[0] + x);
- ma[1][1] = vld1q_u16(ma444[0] + x);
- b[1][0].val[0] = vld1q_u32(b343[0] + x);
- b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
- b[1][1].val[0] = vld1q_u32(b444[0] + x);
- b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[0][1] = CalculateFilteredOutputPass2(s0, ma[1], b[1]);
- ma[2][0] = vld1q_u16(ma343[1] + x);
- b[2][0].val[0] = vld1q_u32(b343[1] + x);
- b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
- p[1][1] = CalculateFilteredOutputPass2(s1, ma[2], b[2]);
- SelfGuidedDoubleMultiplier(s0, p[0], w0, w2, dst + x);
- SelfGuidedDoubleMultiplier(s1, p[1], w0, w2, dst + dst_stride + x);
- x += 8;
- } while (x < width);
-}
-
-inline void BoxFilterLastRow(
- const uint8_t* const src0, const uint8_t* const src, const int width,
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
const uint16_t scales[2], const int16_t w0, const int16_t w2,
uint16_t* const sum3[4], uint16_t* const sum5[5],
uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
uint16_t* const ma343[4], uint16_t* const ma444[3],
uint16_t* const ma565[2], uint32_t* const b343[4], uint32_t* const b444[3],
uint32_t* const b565[2], uint8_t* const dst) {
- uint8x8x2_t s, ma3, ma5;
- uint16x8x2_t sq, b3, b5;
- uint16x8_t ma[3];
- uint32x4x2_t b[3];
- s.val[0] = vld1_u8(src);
- sq.val[0] = vmull_u8(s.val[0], s.val[0]);
- BoxFilterPreProcessLastRow(src, 0, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[0], &ma5.val[0],
- &b3.val[0], &b5.val[0]);
+ uint8x16_t s[2][2], ma3[2][2], ma5[2];
+ uint16x8_t sq[2][4], b3[2][3], b5[3];
+ BoxFilterPreProcessLo(src0, src1, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, &ma5[0], &b5[0]);
int x = 0;
do {
- s.val[0] = s.val[1];
- sq.val[0] = sq.val[1];
- BoxFilterPreProcessLastRow(src, x + 8, scales, sum3, sum5, square_sum3,
- square_sum5, &s, &sq, &ma3.val[1], &ma5.val[1],
- &b3.val[1], &b5.val[1]);
- ma[1] = Sum565(ma5);
- b[1] = Sum565W(b5);
- ma5.val[0] = ma5.val[1];
- b5.val[0] = b5.val[1];
- ma[2] = Sum343(ma3);
- b[2] = Sum343W(b3);
- ma3.val[0] = ma3.val[1];
- b3.val[0] = b3.val[1];
- const uint8x8_t s0 = vld1_u8(src0 + x);
- int16x8_t p[2];
- ma[0] = vld1q_u16(ma565[0] + x);
- b[0].val[0] = vld1q_u32(b565[0] + x + 0);
- b[0].val[1] = vld1q_u32(b565[0] + x + 4);
- p[0] = CalculateFilteredOutputPass1(s0, ma, b);
- ma[0] = vld1q_u16(ma343[0] + x);
- ma[1] = vld1q_u16(ma444[0] + x);
- b[0].val[0] = vld1q_u32(b343[0] + x + 0);
- b[0].val[1] = vld1q_u32(b343[0] + x + 4);
- b[1].val[0] = vld1q_u32(b444[0] + x + 0);
- b[1].val[1] = vld1q_u32(b444[0] + x + 4);
- p[1] = CalculateFilteredOutputPass2(s0, ma, b);
- SelfGuidedDoubleMultiplier(s0, p, w0, w2, dst + x);
- x += 8;
+ uint16x8_t ma[3][3];
+ uint8x16_t ma3x[2][3], ma5x[3];
+ uint32x4x2_t b[3][3];
+ int16x8_t p[2][2];
+ BoxFilterPreProcess(src0, src1, x + 8, scales, s, sum3, sum5, square_sum3,
+ square_sum5, sq, ma3, b3, ma5, b5 + 1);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Store343_444<0>(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], &b[1][2], &b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<0>(ma3x[1], b3[1], x, &ma[2][2], &b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0][1] = Sum565<0>(ma5x);
+ b[0][1] = Sum565W(b5);
+ vst1q_u16(ma565[1] + x, ma[0][1]);
+ vst1q_u32(b565[1] + x, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 4, b[0][1].val[1]);
+ const uint8x16_t sr0 = vld1q_u8(src + x);
+ const uint8x16_t sr1 = vld1q_u8(src + stride + x);
+ const uint8x8_t sr00 = vget_low_u8(sr0);
+ const uint8x8_t sr10 = vget_low_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 4);
+ p[0][0] = CalculateFilteredOutputPass1(sr00, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr10, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x);
+ ma[1][1] = vld1q_u16(ma444[0] + x);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 4);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 4);
+ p[0][1] = CalculateFilteredOutputPass2(sr00, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 4);
+ p[1][1] = CalculateFilteredOutputPass2(sr10, ma[2], b[2]);
+ const uint8x8_t d00 = SelfGuidedDoubleMultiplier(sr00, p[0], w0, w2);
+ const uint8x8_t d10 = SelfGuidedDoubleMultiplier(sr10, p[1], w0, w2);
+
+ Store343_444<8>(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], &b[1][2],
+ &b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444<8>(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], &b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565<8>(ma5x);
+ b[0][1] = Sum565W(b5 + 1);
+ vst1q_u16(ma565[1] + x + 8, ma[0][1]);
+ vst1q_u32(b565[1] + x + 8, b[0][1].val[0]);
+ vst1q_u32(b565[1] + x + 12, b[0][1].val[1]);
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ const uint8x8_t sr01 = vget_high_u8(sr0);
+ const uint8x8_t sr11 = vget_high_u8(sr1);
+ ma[0][0] = vld1q_u16(ma565[0] + x + 8);
+ b[0][0].val[0] = vld1q_u32(b565[0] + x + 8);
+ b[0][0].val[1] = vld1q_u32(b565[0] + x + 12);
+ p[0][0] = CalculateFilteredOutputPass1(sr01, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr11, ma[0][1], b[0][1]);
+ ma[1][0] = vld1q_u16(ma343[0] + x + 8);
+ ma[1][1] = vld1q_u16(ma444[0] + x + 8);
+ b[1][0].val[0] = vld1q_u32(b343[0] + x + 8);
+ b[1][0].val[1] = vld1q_u32(b343[0] + x + 12);
+ b[1][1].val[0] = vld1q_u32(b444[0] + x + 8);
+ b[1][1].val[1] = vld1q_u32(b444[0] + x + 12);
+ p[0][1] = CalculateFilteredOutputPass2(sr01, ma[1], b[1]);
+ ma[2][0] = vld1q_u16(ma343[1] + x + 8);
+ b[2][0].val[0] = vld1q_u32(b343[1] + x + 8);
+ b[2][0].val[1] = vld1q_u32(b343[1] + x + 12);
+ p[1][1] = CalculateFilteredOutputPass2(sr11, ma[2], b[2]);
+ const uint8x8_t d01 = SelfGuidedDoubleMultiplier(sr01, p[0], w0, w2);
+ const uint8x8_t d11 = SelfGuidedDoubleMultiplier(sr11, p[1], w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d00, d01));
+ vst1q_u8(dst + stride + x, vcombine_u8(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ x += 16;
} while (x < width);
}
-template <typename T>
-void Circulate3PointersBy1(T* p[3]) {
- T* const p0 = p[0];
- p[0] = p[1];
- p[1] = p[2];
- p[2] = p0;
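+// Handles the single trailing row when the height is odd: both passes are
+// evaluated over one source row and a single output row is written.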
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ uint8x16_t s[2], ma3[2], ma5[2];
+ uint16x8_t sq[4], ma[3], b3[3], b5[3];
+ uint32x4x2_t b[3];
+ BoxFilterPreProcessLastRowLo(src0, scales, sum3, sum5, square_sum3,
+ square_sum5, &s[0], sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ uint8x16_t ma3x[3], ma5x[3];
+ int16x8_t p[2];
+ BoxFilterPreProcessLastRow(src0, x + 8, scales, sum3, sum5, square_sum3,
+ square_sum5, s, sq + 1, ma3, ma5, &b3[1],
+ &b5[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565<0>(ma5x);
+ b[1] = Sum565W(b5);
+ Prepare3_8<0>(ma3, ma3x);
+ ma[2] = Sum343<0>(ma3x);
+ b[2] = Sum343W(b3);
+ const uint8x16_t sr = vld1q_u8(src + x);
+ const uint8x8_t sr0 = vget_low_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x);
+ b[0].val[0] = vld1q_u32(b565 + x + 0);
+ b[0].val[1] = vld1q_u32(b565 + x + 4);
+ p[0] = CalculateFilteredOutputPass1(sr0, ma, b);
+ ma[0] = vld1q_u16(ma343 + x);
+ ma[1] = vld1q_u16(ma444 + x);
+ b[0].val[0] = vld1q_u32(b343 + x + 0);
+ b[0].val[1] = vld1q_u32(b343 + x + 4);
+ b[1].val[0] = vld1q_u32(b444 + x + 0);
+ b[1].val[1] = vld1q_u32(b444 + x + 4);
+ p[1] = CalculateFilteredOutputPass2(sr0, ma, b);
+ const uint8x8_t d0 = SelfGuidedDoubleMultiplier(sr0, p, w0, w2);
+
+ ma[1] = Sum565<8>(ma5x);
+ b[1] = Sum565W(b5 + 1);
+ b5[0] = b5[2];
+ ma[2] = Sum343<8>(ma3x);
+ b[2] = Sum343W(b3 + 1);
+ b3[0] = b3[2];
+ const uint8x8_t sr1 = vget_high_u8(sr);
+ ma[0] = vld1q_u16(ma565 + x + 8);
+ b[0].val[0] = vld1q_u32(b565 + x + 8);
+ b[0].val[1] = vld1q_u32(b565 + x + 12);
+ p[0] = CalculateFilteredOutputPass1(sr1, ma, b);
+ ma[0] = vld1q_u16(ma343 + x + 8);
+ ma[1] = vld1q_u16(ma444 + x + 8);
+ b[0].val[0] = vld1q_u32(b343 + x + 8);
+ b[0].val[1] = vld1q_u32(b343 + x + 12);
+ b[1].val[0] = vld1q_u32(b444 + x + 8);
+ b[1].val[1] = vld1q_u32(b444 + x + 12);
+ p[1] = CalculateFilteredOutputPass2(sr1, ma, b);
+ const uint8x8_t d1 = SelfGuidedDoubleMultiplier(sr1, p, w0, w2);
+ vst1q_u8(dst + x, vcombine_u8(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ x += 16;
+ } while (x < width);
}
-template <typename T>
-void Circulate4PointersBy2(T* p[4]) {
- std::swap(p[0], p[2]);
- std::swap(p[1], p[3]);
-}
-
-template <typename T>
-void Circulate5PointersBy2(T* p[5]) {
- T* const p0 = p[0];
- T* const p1 = p[1];
- p[0] = p[2];
- p[1] = p[3];
- p[2] = p[4];
- p[3] = p0;
- p[4] = p1;
-}
-
-inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
- const uint8_t* src, const ptrdiff_t src_stride,
- const int width, const int height,
- SgrBuffer* const sgr_buffer, uint8_t* dst,
- const ptrdiff_t dst_stride) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
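+// Drives both passes over the restoration unit two rows at a time; the
+// replicated top and bottom borders are supplied through separate pointers
+// and strides rather than addressed relative to |src|.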
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
@@ -1643,25 +2173,27 @@
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum(src - 2 * src_stride - 3, src_stride, 2, sum_stride, sum3[0], sum5[1],
+ BoxSum(top_border, top_border_stride, sum_stride, sum3[0], sum5[1],
square_sum3[0], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
- BoxSumFilterPreProcess(src - 3, src_stride, width, scales, sum3, sum5,
- square_sum3, square_sum5, ma343, ma444, ma565[0], b343,
- b444, b565[0]);
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, ma343, ma444[0], ma565[0], b343, b444[0],
+ b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
- for (int y = height >> 1; y != 0; --y) {
+
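+ // The loop stops one row pair early; the final pair is handled after the
+ // loop so that its source rows can come from the bottom border.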
+ for (int y = (height >> 1) - 1; y > 0; --y) {
Circulate4PointersBy2<uint16_t>(sum3);
Circulate4PointersBy2<uint32_t>(square_sum3);
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxFilter(src, src + 2 * src_stride - 3, src_stride, width, scales, w0, w2,
- sum3, sum5, square_sum3, square_sum5, ma343, ma444, ma565, b343,
- b444, b565, dst, dst_stride);
- src += 2 * src_stride;
- dst += 2 * dst_stride;
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, ma343,
+ ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
Circulate4PointersBy2<uint16_t>(ma343);
Circulate4PointersBy2<uint32_t>(b343);
std::swap(ma444[0], ma444[2]);
@@ -1669,23 +2201,55 @@
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, ma343, ma444, ma565, b343, b444, b565,
+ dst);
+ }
if ((height & 1) != 0) {
- Circulate4PointersBy2<uint16_t>(sum3);
- Circulate4PointersBy2<uint32_t>(square_sum3);
- Circulate5PointersBy2<uint16_t>(sum5);
- Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxFilterLastRow(src, src + 2 * src_stride - 3, width, scales, w0, w2, sum3,
- sum5, square_sum3, square_sum5, ma343, ma444, ma565, b343,
- b444, b565, dst);
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5,
+ ma343[0], ma444[0], ma565[0], b343[0], b444[0], b565[0],
+ dst);
}
}
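
The restructured BoxFilterProcess() above no longer reaches above and below
|src| itself; callers now pass |top_border| and |bottom_border| explicitly,
and the function picks the row sources for its final filter calls based on
the parity of |height|. As a rough sketch of that selection logic only (the
RowPair helper below is illustrative, not part of libgav1):

    #include <cstddef>
    #include <cstdint>

    struct RowPair {
      const uint8_t* row0;
      const uint8_t* row1;
    };

    // Returns the two rows feeding the last full BoxFilter() call, mirroring
    // the (height & 1) logic in BoxFilterProcess(). Assumes height != 1.
    inline RowPair LastPairSources(const uint8_t* src, std::ptrdiff_t stride,
                                   const uint8_t* bottom_border,
                                   std::ptrdiff_t bottom_border_stride,
                                   int height) {
      if ((height & 1) == 0) {
        // Even height: both remaining rows come from the bottom border.
        return {bottom_border, bottom_border + bottom_border_stride};
      }
      // Odd height greater than 1: one in-frame row remains, then the border.
      return {src + 2 * stride, bottom_border};
    }
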
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
- const ptrdiff_t src_stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst, const ptrdiff_t dst_stride) {
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
@@ -1703,39 +2267,64 @@
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
assert(scale != 0);
- BoxSum<5>(src - 2 * src_stride - 3, src_stride, 2, sum_stride, sum5[1],
- square_sum5[1]);
+ BoxSum<5>(top_border, top_border_stride, sum_stride, sum5[1], square_sum5[1]);
sum5[0] = sum5[1];
square_sum5[0] = square_sum5[1];
- BoxSumFilterPreProcess5(src - 3, src_stride, width, scale, sum5, square_sum5,
- ma565[0], b565[0]);
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, ma565[0],
+ b565[0]);
sum5[0] = sgr_buffer->sum5;
square_sum5[0] = sgr_buffer->square_sum5;
- for (int y = height >> 1; y != 0; --y) {
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxFilterPass1(src, src + 2 * src_stride - 3, src_stride, sum5, square_sum5,
- width, scale, w0, ma565, b565, dst, dst_stride);
- src += 2 * src_stride;
- dst += 2 * dst_stride;
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ scale, w0, ma565, b565, dst);
+ }
if ((height & 1) != 0) {
- Circulate5PointersBy2<uint16_t>(sum5);
- Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxFilterPass1LastRow(src, src + 2 * src_stride - 3, width, scale, w0, sum5,
- square_sum5, ma565[0], b565[0], dst);
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src + 3, bottom_border + bottom_border_stride, width,
+ scale, w0, sum5, square_sum5, ma565[0], b565[0], dst);
}
}
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const uint8_t* src,
- const ptrdiff_t src_stride, const int width,
- const int height, SgrBuffer* const sgr_buffer,
- uint8_t* dst, const ptrdiff_t dst_stride) {
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
- const auto temp_stride = Align<ptrdiff_t>(width, 8);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
@@ -1758,24 +2347,44 @@
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
assert(scale != 0);
- BoxSum<3>(src - 2 * src_stride - 2, src_stride, 2, sum_stride, sum3[0],
- square_sum3[0]);
- BoxSumFilterPreProcess3<false>(src - 2, width, scale, sum3, square_sum3,
- ma343[0], nullptr, b343[0], nullptr);
+ BoxSum<3>(top_border, top_border_stride, sum_stride, sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3, ma343[0],
+ nullptr, b343[0], nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
Circulate3PointersBy1<uint32_t>(square_sum3);
- BoxSumFilterPreProcess3<true>(src + src_stride - 2, width, scale, sum3,
- square_sum3, ma343[1], ma444[0], b343[1],
- b444[0]);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, ma343[1],
+ ma444[0], b343[1], b444[0]);
- int y = height;
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
do {
Circulate3PointersBy1<uint16_t>(sum3);
Circulate3PointersBy1<uint32_t>(square_sum3);
- BoxFilterPass2(src, src + 2 * src_stride - 2, width, scale, w0, sum3,
- square_sum3, ma343, ma444, b343, b444, dst);
- src += src_stride;
- dst += dst_stride;
+ BoxFilterPass2(src, bottom_border, width, scale, w0, sum3, square_sum3,
+ ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -1786,30 +2395,35 @@
// If |width| is not a multiple of 8, up to 7 more pixels are written to
// |dest| at the end of each row. It is safe to overwrite the output as it
// will not be part of the visible frame.
-void SelfGuidedFilter_NEON(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- const ptrdiff_t source_stride,
- const ptrdiff_t dest_stride, const int width,
- const int height,
- RestorationBuffer* const restoration_buffer) {
+void SelfGuidedFilter_NEON(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
auto* const dst = static_cast<uint8_t*>(dest);
SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
if (radius_pass_1 == 0) {
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(restoration_info, src, source_stride, width, height,
- sgr_buffer, dst, dest_stride);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(restoration_info, src, source_stride, width, height,
- sgr_buffer, dst, dest_stride);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(restoration_info, src, source_stride, width, height,
- sgr_buffer, dst, dest_stride);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
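
For reference, SelfGuidedFilter_NEON() now dispatches on the two radii from
kSgrProjParams and pre-offsets |src| and both borders by 3 (radius-2 pass) or
2 (radius-1 pass) columns so the box sums can read left context directly. A
minimal sketch of the dispatch only (the enum and helper are illustrative,
not library code):

    // kSgrProjParams[index][0] is the pass-0 radius (2 or 0) and
    // kSgrProjParams[index][2] is the pass-1 radius (1 or 0); per the
    // assertion above, they cannot both be 0.
    enum class SgrPass { kPass1Only, kPass2Only, kBoth };

    inline SgrPass ChooseSgrPass(const int radius_pass_0,
                                 const int radius_pass_1) {
      if (radius_pass_1 == 0) return SgrPass::kPass1Only;  // 5x5 pass only.
      if (radius_pass_0 == 0) return SgrPass::kPass2Only;  // 3x3 pass only.
      return SgrPass::kBoth;
    }
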
@@ -1828,7 +2442,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/mask_blend_neon.cc b/libgav1/src/dsp/arm/mask_blend_neon.cc
index 21f3fb1..ee50923 100644
--- a/libgav1/src/dsp/arm/mask_blend_neon.cc
+++ b/libgav1/src/dsp/arm/mask_blend_neon.cc
@@ -84,20 +84,19 @@
const int16x8_t pred_mask_0,
const int16x8_t pred_mask_1, uint8_t* dst,
const ptrdiff_t dst_stride) {
- const int16x4_t pred_val_0_lo = vld1_s16(pred_0);
- const int16x4_t pred_val_0_hi = vld1_s16(pred_0 + 4);
- const int16x4_t pred_val_1_lo = vld1_s16(pred_1);
- const int16x4_t pred_val_1_hi = vld1_s16(pred_1 + 4);
+ const int16x8_t pred_val_0 = vld1q_s16(pred_0);
+ const int16x8_t pred_val_1 = vld1q_s16(pred_1);
// int res = (mask_value * prediction_0[x] +
// (64 - mask_value) * prediction_1[x]) >> 6;
const int32x4_t weighted_pred_0_lo =
- vmull_s16(vget_low_s16(pred_mask_0), pred_val_0_lo);
+ vmull_s16(vget_low_s16(pred_mask_0), vget_low_s16(pred_val_0));
const int32x4_t weighted_pred_0_hi =
- vmull_s16(vget_high_s16(pred_mask_0), pred_val_0_hi);
- const int32x4_t weighted_combo_lo =
- vmlal_s16(weighted_pred_0_lo, vget_low_s16(pred_mask_1), pred_val_1_lo);
+ vmull_s16(vget_high_s16(pred_mask_0), vget_high_s16(pred_val_0));
+ const int32x4_t weighted_combo_lo = vmlal_s16(
+ weighted_pred_0_lo, vget_low_s16(pred_mask_1), vget_low_s16(pred_val_1));
const int32x4_t weighted_combo_hi =
- vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1), pred_val_1_hi);
+ vmlal_s16(weighted_pred_0_hi, vget_high_s16(pred_mask_1),
+ vget_high_s16(pred_val_1));
// dst[x] = static_cast<Pixel>(
// Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
// (1 << kBitdepth8) - 1));
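
The change above only consolidates the four 64-bit loads into two 128-bit
vld1q_s16 loads; the arithmetic is untouched. In scalar form, the blend this
hunk computes is roughly (a sketch following the comments above, with the
clip specialized to 8-bit):

    #include <algorithm>
    #include <cstdint>

    // |mask_value| is in [0, 64]; |pred_0|/|pred_1| are intermediate 16-bit
    // predictions; |inter_post_round_bits| is as in the comment above.
    inline uint8_t MaskBlendPixel(const int16_t pred_0, const int16_t pred_1,
                                  const int mask_value,
                                  const int inter_post_round_bits) {
      const int res = (mask_value * pred_0 + (64 - mask_value) * pred_1) >> 6;
      const int rounded =
          (res + (1 << (inter_post_round_bits - 1))) >> inter_post_round_bits;
      return static_cast<uint8_t>(std::min(std::max(rounded, 0), 255));
    }
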
@@ -433,7 +432,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/motion_field_projection_neon.cc b/libgav1/src/dsp/arm/motion_field_projection_neon.cc
index 8caba7d..3e731b2 100644
--- a/libgav1/src/dsp/arm/motion_field_projection_neon.cc
+++ b/libgav1/src/dsp/arm/motion_field_projection_neon.cc
@@ -382,7 +382,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/motion_vector_search_neon.cc b/libgav1/src/dsp/arm/motion_vector_search_neon.cc
index 8a403a6..da3ba17 100644
--- a/libgav1/src/dsp/arm/motion_vector_search_neon.cc
+++ b/libgav1/src/dsp/arm/motion_vector_search_neon.cc
@@ -256,7 +256,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/obmc_neon.cc b/libgav1/src/dsp/arm/obmc_neon.cc
index 66ad663..1111a90 100644
--- a/libgav1/src/dsp/arm/obmc_neon.cc
+++ b/libgav1/src/dsp/arm/obmc_neon.cc
@@ -380,7 +380,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/super_res_neon.cc b/libgav1/src/dsp/arm/super_res_neon.cc
index d77b9c7..91537c4 100644
--- a/libgav1/src/dsp/arm/super_res_neon.cc
+++ b/libgav1/src/dsp/arm/super_res_neon.cc
@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/arm/common_neon.h"
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
@@ -20,8 +19,10 @@
#include <arm_neon.h>
+#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
#include "src/utils/constants.h"
namespace libgav1 {
@@ -30,57 +31,265 @@
namespace low_bitdepth {
namespace {
-void ComputeSuperRes_NEON(const void* source, const int upscaled_width,
- const int initial_subpixel_x, const int step,
- void* const dest) {
- const auto* src = static_cast<const uint8_t*>(source);
- auto* dst = static_cast<uint8_t*>(dest);
- src -= kSuperResFilterTaps >> 1;
-
- int p = initial_subpixel_x;
- uint16x8_t weighted_src[8];
- for (int x = 0; x < upscaled_width; x += 8) {
- for (int i = 0; i < kSuperResFilterTaps; ++i, p += step) {
- const uint8x8_t src_x = vld1_u8(&src[p >> kSuperResScaleBits]);
- const int remainder = p & kSuperResScaleMask;
- const uint8x8_t filter =
- vld1_u8(kUpscaleFilterUnsigned[remainder >> kSuperResExtraBits]);
- weighted_src[i] = vmull_u8(src_x, filter);
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint8x8_t filter[8];
+ uint8x16_t d[kSuperResFilterTaps / 2];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ filter[i] =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
}
- Transpose8x8(weighted_src);
+ Transpose8x8(filter, d);
+ vst1q_u8(dst, d[0]);
+ dst += 16;
+ vst1q_u8(dst, d[1]);
+ dst += 16;
+ vst1q_u8(dst, d[2]);
+ dst += 16;
+ vst1q_u8(dst, d[3]);
+ dst += 16;
+ } while (--x != 0);
+}
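
A scalar model of what SuperResCoefficients_NEON() precomputes: one 8-tap
filter per upscaled pixel, selected from the filter table by the subpixel
phase. The NEON version additionally transposes each group of 8 filters so
SuperRes() can stream them back with plain 16-byte loads; that layout detail
is omitted in this illustrative sketch:

    #include <cstdint>

    // |filters| stands in for kUpscaleFilterUnsigned; |scale_mask| and
    // |extra_bits| stand in for kSuperResScaleMask / kSuperResExtraBits.
    void GatherSuperResFilters(const uint8_t filters[][8],
                               const int scale_mask, const int extra_bits,
                               const int upscaled_width,
                               const int initial_subpixel_x, const int step,
                               const uint8_t** out) {
      int subpixel_x = initial_subpixel_x;
      for (int x = 0; x < upscaled_width; ++x, subpixel_x += step) {
        // The low bits of the subpixel position pick one filter row.
        out[x] = filters[(subpixel_x & scale_mask) >> extra_bits];
      }
    }
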
- // Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
- // Maximum sum: 255*171 == 0xAA55
- // The sum is clipped to [0, 255], so adding all positive and then
- // subtracting all negative with saturation is sufficient.
- // 0 1 2 3 4 5 6 7
- // tap sign: - + - + + - + -
- uint16x8_t res = weighted_src[1];
- res = vaddq_u16(res, weighted_src[3]);
- res = vaddq_u16(res, weighted_src[4]);
- res = vaddq_u16(res, weighted_src[6]);
- res = vqsubq_u16(res, weighted_src[0]);
- res = vqsubq_u16(res, weighted_src[2]);
- res = vqsubq_u16(res, weighted_src[5]);
- res = vqsubq_u16(res, weighted_src[7]);
- vst1_u8(&dst[x], vqrshrn_n_u16(res, kFilterBits));
+// Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
+// Maximum sum: 255*171 == 0xAA55
+// The sum is clipped to [0, 255], so adding all positive and then
+// subtracting all negative with saturation is sufficient.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
+ const uint8_t** coefficients) {
+ uint8x16_t f[kSuperResFilterTaps / 2];
+ for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
+ f[i] = vld1q_u8(*coefficients);
}
+ uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
+ res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
+ res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
+ res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
+ uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
+ temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
+ temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
+ temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
+ res = vqsubq_u16(res, temp);
+ return vqrshrn_n_u16(res, kFilterBits);
+}
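
The grouping in SuperRes() above follows the tap signs listed in the comment:
taps 1, 3, 4 and 6 are positive, taps 0, 2, 5 and 7 are negative, so each
group is accumulated separately and combined with a saturating subtract,
which also clamps the result at zero. A scalar model (the narrowing shift's
additional saturation to 255 is elided):

    #include <cstdint>

    inline int SuperResScalar(const uint8_t s[8], const uint8_t f[8],
                              const int filter_bits) {
      const int pos = s[1] * f[1] + s[3] * f[3] + s[4] * f[4] + s[6] * f[6];
      const int neg = s[0] * f[0] + s[2] * f[2] + s[5] * f[5] + s[7] * f[7];
      const int diff = (pos > neg) ? pos - neg : 0;  // models vqsubq_u16
      return (diff + (1 << (filter_bits - 1))) >> filter_bits;  // rounding
    }
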
+
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint8x8_t sr[8];
+ uint8x16_t s[8];
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ // The code below calculates up to 15 extra upscaled pixels which will
+ // over-read up to 15 downscaled pixels at the end of each row.
+ // kSuperResHorizontalPadding accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
+ s[i] = vcombine_u8(sr[i], s_hi);
+ }
+ Transpose8x16(s);
+ // Do not use a loop for the following 8 instructions, since the compiler
+ // will generate redundant code.
+ sr[0] = vget_low_u8(s[0]);
+ sr[1] = vget_low_u8(s[1]);
+ sr[2] = vget_low_u8(s[2]);
+ sr[3] = vget_low_u8(s[3]);
+ sr[4] = vget_low_u8(s[4]);
+ sr[5] = vget_low_u8(s[5]);
+ sr[6] = vget_low_u8(s[6]);
+ sr[7] = vget_low_u8(s[7]);
+ const uint8x8_t d0 = SuperRes(sr, &filter);
+ // Do not use a loop for the following 8 instructions, since the compiler
+ // will generate redundant code.
+ sr[0] = vget_high_u8(s[0]);
+ sr[1] = vget_high_u8(s[1]);
+ sr[2] = vget_high_u8(s[2]);
+ sr[3] = vget_high_u8(s[3]);
+ sr[4] = vget_high_u8(s[4]);
+ sr[5] = vget_high_u8(s[5]);
+ sr[6] = vget_high_u8(s[6]);
+ sr[7] = vget_high_u8(s[7]);
+ const uint8x8_t d1 = SuperRes(sr, &filter);
+ vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
}
void Init8bpp() {
Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
- dsp->super_res_row = ComputeSuperRes_NEON;
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON;
}
} // namespace
} // namespace low_bitdepth
-void SuperResInit_NEON() { low_bitdepth::Init8bpp(); }
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+void SuperResCoefficients_NEON(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ uint16x8_t filter[8];
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ const uint8x8_t filter_8 =
+ vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
+ kSuperResExtraBits]);
+ // uint8_t -> uint16_t
+ filter[i] = vmovl_u8(filter_8);
+ }
+
+ Transpose8x8(filter);
+
+ vst1q_u16(dst, filter[0]);
+ dst += 8;
+ vst1q_u16(dst, filter[1]);
+ dst += 8;
+ vst1q_u16(dst, filter[2]);
+ dst += 8;
+ vst1q_u16(dst, filter[3]);
+ dst += 8;
+ vst1q_u16(dst, filter[4]);
+ dst += 8;
+ vst1q_u16(dst, filter[5]);
+ dst += 8;
+ vst1q_u16(dst, filter[6]);
+ dst += 8;
+ vst1q_u16(dst, filter[7]);
+ dst += 8;
+ } while (--x != 0);
+}
+
+// The sum is clipped to [0, (1 << bitdepth) - 1]. Adding all positive and
+// then subtracting all negative with saturation will clip to zero.
+// 0 1 2 3 4 5 6 7
+// tap sign: - + - + + - + -
+inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
+ const uint16_t** coefficients, int bitdepth) {
+ uint16x8_t f[kSuperResFilterTaps];
+ for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
+ f[i] = vld1q_u16(*coefficients);
+ }
+
+ uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
+ res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
+
+ uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
+ temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
+
+ res_lo = vqsubq_u32(res_lo, temp_lo);
+
+ uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
+ res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
+
+ uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
+ temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
+
+ res_hi = vqsubq_u32(res_hi, temp_hi);
+
+ const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
+ vqrshrn_n_u32(res_hi, kFilterBits));
+
+ // Clip the result at (1 << bd) - 1.
+ return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
+}
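
Why the 10-bit helper widens to 32 bits: with 10-bit pixels the positive taps
alone can reach 1023 * 171 = 174,933 (171 being the positive-tap bound quoted
in the 8-bit comment), which overflows a 16-bit accumulator, hence the
vmull_u16/vmlal_u16 chains into uint32x4_t and the explicit vminq_u16 clip. A
compile-time restatement of that bound, for illustration only:

    static_assert(1023 * 171 > 65535,
                  "10bpp SuperRes accumulation does not fit in uint16_t");
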
+
+template <int bitdepth>
+void SuperRes_NEON(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ uint16x8_t sr[8];
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ // The code below calculates up to 7 extra upscaled pixels which will
+ // over-read up to 7 downscaled pixels at the end of each row.
+ // kSuperResHorizontalBorder accounts for this.
+ do {
+ for (int i = 0; i < 8; ++i, subpixel_x += step) {
+ sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
+ }
+
+ Transpose8x8(sr);
+
+ const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
+ vst1q_u16(dst_ptr, d0);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ dsp->super_res_coefficients = SuperResCoefficients_NEON;
+ dsp->super_res = SuperRes_NEON<10>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void SuperResInit_NEON() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/super_res_neon.h b/libgav1/src/dsp/arm/super_res_neon.h
index f51785d..65e48c5 100644
--- a/libgav1/src/dsp/arm/super_res_neon.h
+++ b/libgav1/src/dsp/arm/super_res_neon.h
@@ -31,7 +31,10 @@
#if LIBGAV1_ENABLE_NEON
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_NEON
-#define LIBGAV1_Dsp8bpp_SuperResClip LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_NEON
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_NEON
#endif // LIBGAV1_ENABLE_NEON
#endif // LIBGAV1_SRC_DSP_ARM_SUPER_RES_NEON_H_
diff --git a/libgav1/src/dsp/arm/warp_neon.cc b/libgav1/src/dsp/arm/warp_neon.cc
index 7a41998..c7fb739 100644
--- a/libgav1/src/dsp/arm/warp_neon.cc
+++ b/libgav1/src/dsp/arm/warp_neon.cc
@@ -289,7 +289,7 @@
const int16x8_t sum = vld1q_s16(tmp);
vst1_u8(reinterpret_cast<uint8_t*>(dst_row), vqmovun_s16(sum));
}
-#else // !defined(__aarch64__)
+#else // !defined(__aarch64__)
int16x8_t filter[8];
for (int x = 0; x < 8; ++x) {
const int offset =
@@ -442,7 +442,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/arm/weight_mask_neon.cc b/libgav1/src/dsp/arm/weight_mask_neon.cc
index 49d3be0..7e5bff0 100644
--- a/libgav1/src/dsp/arm/weight_mask_neon.cc
+++ b/libgav1/src/dsp/arm/weight_mask_neon.cc
@@ -451,7 +451,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_NEON
+#else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {
diff --git a/libgav1/src/dsp/average_blend.cc b/libgav1/src/dsp/average_blend.cc
index a59abb0..d3ec21f 100644
--- a/libgav1/src/dsp/average_blend.cc
+++ b/libgav1/src/dsp/average_blend.cc
@@ -76,9 +76,7 @@
Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
-#ifndef LIBGAV1_Dsp10bpp_AverageBlend
dsp->average_blend = AverageBlend_C<10, uint16_t>;
-#endif
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
static_cast<void>(dsp);
#ifndef LIBGAV1_Dsp10bpp_AverageBlend
diff --git a/libgav1/src/dsp/cdef.cc b/libgav1/src/dsp/cdef.cc
index 95e5a4a..0b50517 100644
--- a/libgav1/src/dsp/cdef.cc
+++ b/libgav1/src/dsp/cdef.cc
@@ -41,7 +41,7 @@
template <int bitdepth, typename Pixel>
void CdefDirection_C(const void* const source, ptrdiff_t stride,
- int* const direction, int* const variance) {
+ uint8_t* const direction, int* const variance) {
assert(direction != nullptr);
assert(variance != nullptr);
const auto* src = static_cast<const Pixel*>(source);
diff --git a/libgav1/src/dsp/cdef.h b/libgav1/src/dsp/cdef.h
index 2d70d2c..b820b77 100644
--- a/libgav1/src/dsp/cdef.h
+++ b/libgav1/src/dsp/cdef.h
@@ -30,6 +30,7 @@
// The order of includes is important as each tests for a superior version
// before setting the base.
// clang-format off
+#include "src/dsp/x86/cdef_avx2.h"
#include "src/dsp/x86/cdef_sse4.h"
// clang-format on
// IWYU pragma: end_exports
diff --git a/libgav1/src/dsp/common.h b/libgav1/src/dsp/common.h
index 8ce3211..d614a81 100644
--- a/libgav1/src/dsp/common.h
+++ b/libgav1/src/dsp/common.h
@@ -25,7 +25,7 @@
namespace libgav1 {
-enum { kSgrStride = kRestorationUnitWidth + 8 }; // anonymous enum
+enum { kSgrStride = kRestorationUnitWidth + 32 }; // anonymous enum
// Self guided projection filter.
struct SgrProjInfo {
@@ -57,8 +57,9 @@
alignas(kMaxAlignment) uint32_t b343[4 * kRestorationUnitWidth];
alignas(kMaxAlignment) uint32_t b444[3 * kRestorationUnitWidth];
alignas(kMaxAlignment) uint32_t b565[2 * kRestorationUnitWidth];
- alignas(kMaxAlignment) uint16_t
- temp_buffer[12 * (kRestorationUnitHeight + 2)];
+ // The following 2 buffers are only used by the C functions. Since SgrBuffer
+ // is smaller than |wiener_buffer| in RestorationBuffer, which is a union,
+ // it's OK to always keep the following 2 buffers.
alignas(kMaxAlignment) uint8_t ma[kSgrStride]; // [0, 255]
// b is less than 2^16 for 8-bit. However, making it a template slows down the
// C function by 5%. So b is fixed to 32-bit.
diff --git a/libgav1/src/dsp/constants.cc b/libgav1/src/dsp/constants.cc
index 0099ca3..1b85795 100644
--- a/libgav1/src/dsp/constants.cc
+++ b/libgav1/src/dsp/constants.cc
@@ -20,7 +20,7 @@
// Each set of 7 taps is padded with a 0 to easily align and pack into the high
// and low 8 bytes. This way, we can load 16 at a time to fit mulhi and mullo.
-const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
+alignas(16) const int8_t kFilterIntraTaps[kNumFilterIntraPredictors][8][8] = {
{{-6, 10, 0, 0, 0, 12, 0, 0},
{-5, 2, 10, 0, 0, 9, 0, 0},
{-3, 1, 1, 10, 0, 7, 0, 0},
diff --git a/libgav1/src/dsp/convolve.cc b/libgav1/src/dsp/convolve.cc
index c8df357..727b4af 100644
--- a/libgav1/src/dsp/convolve.cc
+++ b/libgav1/src/dsp/convolve.cc
@@ -226,8 +226,9 @@
void ConvolveCompound2D_C(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
- const int vertical_filter_index, const int subpixel_x,
- const int subpixel_y, const int width,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
const int height, void* prediction,
const ptrdiff_t pred_stride) {
// All compound functions output to the predictor buffer with |pred_stride|
@@ -257,16 +258,17 @@
const auto* src = static_cast<const Pixel*>(reference) -
kVerticalOffset * src_stride - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
- int filter_id = (subpixel_x >> 6) & kSubPixelMask;
- // If |filter_id| == 0 then ConvolveVertical() should be called.
- assert(filter_id != 0);
+
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
int y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
}
intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
} while (++x < width);
@@ -278,16 +280,15 @@
// Vertical filter.
filter_index = GetFilterIndex(vertical_filter_index, height);
intermediate = intermediate_result;
- filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- // If |filter_id| == 0 then ConvolveHorizontal() should be called.
- assert(filter_id != 0);
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
intermediate[k * intermediate_stride + x];
}
sum = RightShiftWithRounding(sum, kRoundBitsVertical - 1);
@@ -308,9 +309,10 @@
template <int bitdepth, typename Pixel>
void Convolve2D_C(const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index,
- const int vertical_filter_index, const int subpixel_x,
- const int subpixel_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int vertical_filter_index,
+ const int horizontal_filter_id, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
constexpr int kRoundBitsHorizontal = (bitdepth == 12)
? kInterRoundBitsHorizontal12bpp
: kInterRoundBitsHorizontal;
@@ -336,16 +338,16 @@
kVerticalOffset * src_stride - kHorizontalOffset;
auto* dest = static_cast<Pixel*>(prediction);
const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
- int filter_id = (subpixel_x >> 6) & kSubPixelMask;
- // If |filter_id| == 0 then ConvolveVertical() should be called.
- assert(filter_id != 0);
+ // If |horizontal_filter_id| == 0 then ConvolveVertical() should be called.
+ assert(horizontal_filter_id != 0);
int y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
}
intermediate[x] = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
} while (++x < width);
@@ -357,16 +359,15 @@
// Vertical filter.
filter_index = GetFilterIndex(vertical_filter_index, height);
intermediate = intermediate_result;
- filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- // If |filter_id| == 0 then ConvolveHorizontal() should be called.
- assert(filter_id != 0);
+ // If |vertical_filter_id| == 0 then ConvolveHorizontal() should be called.
+ assert(vertical_filter_id != 0);
y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
intermediate[k * intermediate_stride + x];
}
dest[x] = Clip3(RightShiftWithRounding(sum, kRoundBitsVertical - 1), 0,
@@ -388,8 +389,9 @@
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/,
- const int width, const int height, void* prediction,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
const ptrdiff_t pred_stride) {
constexpr int kRoundBitsHorizontal = (bitdepth == 12)
? kInterRoundBitsHorizontal12bpp
@@ -400,7 +402,6 @@
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<Pixel*>(prediction);
const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
- const int filter_id = (subpixel_x >> 6) & kSubPixelMask;
const int max_pixel_value = (1 << bitdepth) - 1;
int y = 0;
do {
@@ -408,7 +409,8 @@
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
}
sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
dest[x] = Clip3(RightShiftWithRounding(sum, bits), 0, max_pixel_value);
@@ -429,8 +431,9 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y,
- const int width, const int height, void* prediction,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
@@ -438,9 +441,8 @@
static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
auto* dest = static_cast<Pixel*>(prediction);
const ptrdiff_t dest_stride = pred_stride / sizeof(Pixel);
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
// Copy filters must call ConvolveCopy().
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
const int max_pixel_value = (1 << bitdepth) - 1;
int y = 0;
@@ -449,7 +451,7 @@
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
src[k * src_stride + x];
}
dest[x] = Clip3(RightShiftWithRounding(sum, kFilterBits - 1), 0,
@@ -466,8 +468,9 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/,
- const int width, const int height, void* prediction,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
auto* dest = static_cast<uint8_t*>(prediction);
@@ -484,8 +487,9 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/,
- const int width, const int height, void* prediction,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
const ptrdiff_t pred_stride) {
// All compound functions output to the predictor buffer with |pred_stride|
// equal to |width|.
@@ -521,8 +525,9 @@
void ConvolveCompoundHorizontal_C(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t pred_stride) {
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
// All compound functions output to the predictor buffer with |pred_stride|
// equal to |width|.
assert(pred_stride == width);
@@ -535,16 +540,16 @@
const auto* src = static_cast<const Pixel*>(reference) - kHorizontalOffset;
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = (subpixel_x >> 6) & kSubPixelMask;
// Copy filters must call ConvolveCopy().
- assert(filter_id != 0);
+ assert(horizontal_filter_id != 0);
int y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] * src[x + k];
+ sum += kHalfSubPixelFilters[filter_index][horizontal_filter_id][k] *
+ src[x + k];
}
sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
sum += (bitdepth == 8) ? 0 : kCompoundOffset;
@@ -566,9 +571,10 @@
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
// All compound functions output to the predictor buffer with |pred_stride|
// equal to |width|.
assert(pred_stride == width);
@@ -582,16 +588,15 @@
const auto* src =
static_cast<const Pixel*>(reference) - kVerticalOffset * src_stride;
auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
// Copy filters must call ConvolveCopy().
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
int y = 0;
do {
int x = 0;
do {
int sum = 0;
for (int k = 0; k < kSubPixelTaps; ++k) {
- sum += kHalfSubPixelFilters[filter_index][filter_id][k] *
+ sum += kHalfSubPixelFilters[filter_index][vertical_filter_id][k] *
src[k * src_stride + x];
}
sum = RightShiftWithRounding(sum, kRoundBitsHorizontal - 1);
@@ -610,11 +615,16 @@
// The output is the single prediction of the block, clipped to valid pixel
// range.
template <int bitdepth, typename Pixel>
-void ConvolveIntraBlockCopy2D_C(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy2D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
const auto* src = static_cast<const Pixel*>(reference);
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<Pixel*>(prediction);
@@ -660,11 +670,16 @@
// The filtering of intra block copy is simply the average of current and
// the next pixel.
template <int bitdepth, typename Pixel, bool is_horizontal>
-void ConvolveIntraBlockCopy1D_C(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveIntraBlockCopy1D_C(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
+ assert(width >= 4 && width <= kMaxSuperBlockSizeInPixels);
+ assert(height >= 4 && height <= kMaxSuperBlockSizeInPixels);
const auto* src = static_cast<const Pixel*>(reference);
const ptrdiff_t src_stride = reference_stride / sizeof(Pixel);
auto* dest = static_cast<Pixel*>(prediction);
diff --git a/libgav1/src/dsp/convolve.h b/libgav1/src/dsp/convolve.h
index 95019e2..5bc0bad 100644
--- a/libgav1/src/dsp/convolve.h
+++ b/libgav1/src/dsp/convolve.h
@@ -30,6 +30,7 @@
// The order of includes is important as each tests for a superior version
// before setting the base.
// clang-format off
+#include "src/dsp/x86/convolve_avx2.h"
#include "src/dsp/x86/convolve_sse4.h"
// clang-format on
diff --git a/libgav1/src/dsp/convolve.inc b/libgav1/src/dsp/convolve.inc
new file mode 100644
index 0000000..140648b
--- /dev/null
+++ b/libgav1/src/dsp/convolve.inc
@@ -0,0 +1,50 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Constants and utility functions used for convolve implementations.
+// This will be included inside an anonymous namespace on files where these are
+// necessary.
+
+int GetNumTapsInFilter(const int filter_index) {
+ if (filter_index < 2) {
+ // Despite the names, these only use 6 taps.
+ // kInterpolationFilterEightTap
+ // kInterpolationFilterEightTapSmooth
+ return 6;
+ }
+
+ if (filter_index == 2) {
+ // kInterpolationFilterEightTapSharp
+ return 8;
+ }
+
+ if (filter_index == 3) {
+ // kInterpolationFilterBilinear
+ return 2;
+ }
+
+ assert(filter_index > 3);
+ // For small sizes (width/height <= 4) the large filters are replaced with
+ // 4-tap options.
+ // If the original filter was |kInterpolationFilterEightTap| or
+ // |kInterpolationFilterEightTapSharp|, it becomes
+ // |kInterpolationFilterSwitchable|.
+ // If it was |kInterpolationFilterEightTapSmooth|, it becomes an unnamed
+ // 4-tap filter.
+ return 4;
+}
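
A usage sketch of the new shared helper (assumes this .inc has been included
into the caller's anonymous namespace, as the header comment describes):

    #include <cassert>

    void CheckTapCounts() {
      assert(GetNumTapsInFilter(0) == 6);  // kInterpolationFilterEightTap
      assert(GetNumTapsInFilter(1) == 6);  // ...EightTapSmooth
      assert(GetNumTapsInFilter(2) == 8);  // ...EightTapSharp
      assert(GetNumTapsInFilter(3) == 2);  // kInterpolationFilterBilinear
      assert(GetNumTapsInFilter(4) == 4);  // 4-tap small-size replacements
      assert(GetNumTapsInFilter(5) == 4);
    }
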
+
+constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
+constexpr int kHorizontalOffset = 3;
+constexpr int kFilterIndexShift = 6;
diff --git a/libgav1/src/dsp/dsp.cc b/libgav1/src/dsp/dsp.cc
index c1df276..a3d7701 100644
--- a/libgav1/src/dsp/dsp.cc
+++ b/libgav1/src/dsp/dsp.cc
@@ -16,7 +16,6 @@
#include <mutex> // NOLINT (unapproved c++11 header)
-#include "src/dsp/arm/weight_mask_neon.h"
#include "src/dsp/average_blend.h"
#include "src/dsp/cdef.h"
#include "src/dsp/convolve.h"
@@ -24,6 +23,10 @@
#include "src/dsp/film_grain.h"
#include "src/dsp/intra_edge.h"
#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
+#include "src/dsp/intrapred_directional.h"
+#include "src/dsp/intrapred_filter.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/dsp/inverse_transform.h"
#include "src/dsp/loop_filter.h"
#include "src/dsp/loop_restoration.h"
@@ -39,6 +42,30 @@
namespace libgav1 {
namespace dsp_internal {
+void DspInit_C() {
+ dsp::AverageBlendInit_C();
+ dsp::CdefInit_C();
+ dsp::ConvolveInit_C();
+ dsp::DistanceWeightedBlendInit_C();
+ dsp::FilmGrainInit_C();
+ dsp::IntraEdgeInit_C();
+ dsp::IntraPredCflInit_C();
+ dsp::IntraPredDirectionalInit_C();
+ dsp::IntraPredFilterInit_C();
+ dsp::IntraPredInit_C();
+ dsp::IntraPredSmoothInit_C();
+ dsp::InverseTransformInit_C();
+ dsp::LoopFilterInit_C();
+ dsp::LoopRestorationInit_C();
+ dsp::MaskBlendInit_C();
+ dsp::MotionFieldProjectionInit_C();
+ dsp::MotionVectorSearchInit_C();
+ dsp::ObmcInit_C();
+ dsp::SuperResInit_C();
+ dsp::WarpInit_C();
+ dsp::WeightMaskInit_C();
+}
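
A sketch of the intended test-only usage of DspInit_C() (see the new comment
in dsp.h below; the surrounding function here is illustrative):

    #include <cassert>

    void InitDspForTest() {
      libgav1::dsp_internal::DspInit_C();  // C-only pointers; not thread-safe.
      libgav1::dsp::Dsp* const dsp =
          libgav1::dsp_internal::GetWritableDspTable(8);
      assert(dsp != nullptr);
    }
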
+
dsp::Dsp* GetWritableDspTable(int bitdepth) {
switch (bitdepth) {
case 8: {
@@ -62,31 +89,20 @@
void DspInit() {
static std::once_flag once;
std::call_once(once, []() {
- AverageBlendInit_C();
- CdefInit_C();
- ConvolveInit_C();
- DistanceWeightedBlendInit_C();
- FilmGrainInit_C();
- IntraEdgeInit_C();
- IntraPredInit_C();
- InverseTransformInit_C();
- LoopFilterInit_C();
- LoopRestorationInit_C();
- MaskBlendInit_C();
- MotionFieldProjectionInit_C();
- MotionVectorSearchInit_C();
- ObmcInit_C();
- SuperResInit_C();
- WarpInit_C();
- WeightMaskInit_C();
-#if LIBGAV1_ENABLE_SSE4_1
+ dsp_internal::DspInit_C();
+#if LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
const uint32_t cpu_features = GetCpuInfo();
+#if LIBGAV1_ENABLE_SSE4_1
if ((cpu_features & kSSE4_1) != 0) {
AverageBlendInit_SSE4_1();
CdefInit_SSE4_1();
ConvolveInit_SSE4_1();
DistanceWeightedBlendInit_SSE4_1();
+ FilmGrainInit_SSE4_1();
IntraEdgeInit_SSE4_1();
+ IntraPredCflInit_SSE4_1();
+ IntraPredDirectionalInit_SSE4_1();
+ IntraPredFilterInit_SSE4_1();
IntraPredInit_SSE4_1();
- IntraPredCflInit_SSE4_1();
IntraPredSmoothInit_SSE4_1();
@@ -100,8 +116,22 @@
SuperResInit_SSE4_1();
WarpInit_SSE4_1();
WeightMaskInit_SSE4_1();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_SSE4_1();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
}
#endif // LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_ENABLE_AVX2
+ if ((cpu_features & kAVX2) != 0) {
+ CdefInit_AVX2();
+ ConvolveInit_AVX2();
+ LoopRestorationInit_AVX2();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ LoopRestorationInit10bpp_AVX2();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+ }
+#endif // LIBGAV1_ENABLE_AVX2
+#endif // LIBGAV1_ENABLE_SSE4_1 || LIBGAV1_ENABLE_AVX2
#if LIBGAV1_ENABLE_NEON
AverageBlendInit_NEON();
CdefInit_NEON();
@@ -111,7 +141,7 @@
IntraEdgeInit_NEON();
IntraPredCflInit_NEON();
IntraPredDirectionalInit_NEON();
- IntraPredFilterIntraInit_NEON();
+ IntraPredFilterInit_NEON();
IntraPredInit_NEON();
IntraPredSmoothInit_NEON();
InverseTransformInit_NEON();
@@ -124,6 +154,9 @@
SuperResInit_NEON();
WarpInit_NEON();
WeightMaskInit_NEON();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ InverseTransformInit10bpp_NEON();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
#endif // LIBGAV1_ENABLE_NEON
});
}
diff --git a/libgav1/src/dsp/dsp.h b/libgav1/src/dsp/dsp.h
index 1fa1560..153db7f 100644
--- a/libgav1/src/dsp/dsp.h
+++ b/libgav1/src/dsp/dsp.h
@@ -17,7 +17,7 @@
#ifndef LIBGAV1_SRC_DSP_DSP_H_
#define LIBGAV1_SRC_DSP_DSP_H_
-#include <cstddef> // ptrdiff_t
+#include <cstddef>
#include <cstdint>
#include <cstdlib>
@@ -79,6 +79,11 @@
kNumLoopFilterSizes
};
+enum : uint8_t {
+ kRow = 0,
+ kColumn = 1,
+};
+
//------------------------------------------------------------------------------
// ToString()
//
@@ -298,16 +303,20 @@
// 7.13.3).
// Apply the inverse transforms and add the residual to the destination frame
// for the transform type and block size |tx_size| starting at position
-// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D. |is_row|
-// signals the direction of the transform loop. |non_zero_coeff_count| is the
-// number of non zero coefficients in the block.
+// |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
+// |adjusted_tx_height| is the number of rows to process based on the
+// non-zero coefficient count in the block. It will be 1 (when the non-zero
+// coefficient count is 1), 4, or a multiple of 8 up to 32 or the original
+// transform height, whichever is less.
using InverseTransformAddFunc = void (*)(TransformType tx_type,
TransformSize tx_size,
+ int adjusted_tx_height,
void* src_buffer, int start_x,
- int start_y, void* dst_frame,
- bool is_row, int non_zero_coeff_count);
+ int start_y, void* dst_frame);
+// The final dimension holds row and column transforms indexed with kRow and
+// kColumn.
using InverseTransformAddFuncs =
- InverseTransformAddFunc[kNum1DTransformSizes][kNum1DTransforms];
+ InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
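
A dispatch sketch for the reshaped table (the enum values k1DTransformDct,
k1DTransformSize8x8, kTransformTypeDctDct and kTransformSize8x8, and the Dsp
member name inverse_transforms, are assumed from the library):

    void ApplyDct8x8(const libgav1::dsp::Dsp& dsp,
                     const int adjusted_tx_height, void* src_buffer,
                     const int start_x, const int start_y, void* dst_frame) {
      // Row pass first, then column pass, via the new trailing dimension.
      dsp.inverse_transforms[k1DTransformDct][k1DTransformSize8x8][kRow](
          kTransformTypeDctDct, kTransformSize8x8, adjusted_tx_height,
          src_buffer, start_x, start_y, dst_frame);
      dsp.inverse_transforms[k1DTransformDct][k1DTransformSize8x8][kColumn](
          kTransformTypeDctDct, kTransformSize8x8, adjusted_tx_height,
          src_buffer, start_x, start_y, dst_frame);
    }
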
//------------------------------------------------------------------------------
// Post processing.
@@ -325,7 +334,7 @@
// with |stride| given in bytes. |direction| and |variance| are output
// parameters and must not be nullptr.
using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
- int* direction, int* variance);
+ uint8_t* direction, int* variance);
// Cdef filtering function signature. Section 7.15.3.
// |source| is a pointer to the input block padded with kCdefLargeValue if at a
@@ -346,30 +355,53 @@
// |primary_strength| only, [2]: |secondary_strength| only.
using CdefFilteringFuncs = CdefFilteringFunc[2][3];
-// Upscaling process function signature. Section 7.16.
-// Operates on a single row.
-// |source| is the input frame buffer at the given row.
-// |dest| is the output row.
+// Upscaling coefficients function signature. Section 7.16.
+// This is an auxiliary function for SIMD optimizations and has no
+// corresponding C function. Different SIMD versions may have different
+// outputs, so it must be paired with the corresponding version of
+// SuperResFunc.
// |upscaled_width| is the width of the output frame.
// |step| is the number of subpixels to move the kernel for the next destination
// pixel.
// |initial_subpixel_x| is a base offset from which |step| increments.
-using SuperResRowFunc = void (*)(const void* source, const int upscaled_width,
- const int initial_subpixel_x, const int step,
- void* const dest);
+// |coefficients| is the upscale filter used by each pixel in a row.
+using SuperResCoefficientsFunc = void (*)(int upscaled_width,
+ int initial_subpixel_x, int step,
+ void* coefficients);
+
+// Upscaling process function signature. Section 7.16.
+// |coefficients| is the upscale filter used by each pixel in a row. It is not
+// used by the C function.
+// |source| is the input frame buffer. It will be line extended.
+// |source_stride| is given in pixels.
+// |dest| is the output buffer.
+// |dest_stride| is given in pixels.
+// |height| is the height of the block to be processed.
+// |downscaled_width| is the width of the input frame.
+// |upscaled_width| is the width of the output frame.
+// |step| is the number of subpixels to move the kernel for the next destination
+// pixel.
+// |initial_subpixel_x| is a base offset from which |step| increments.
+using SuperResFunc = void (*)(const void* coefficients, void* source,
+ ptrdiff_t source_stride, int height,
+ int downscaled_width, int upscaled_width,
+ int initial_subpixel_x, int step, void* dest,
+ ptrdiff_t dest_stride);
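
A pairing sketch for the two new entry points (the wrapper and buffer naming
are illustrative; the nullptr guard reflects the note above that the C path
has no coefficients function):

    #include <cstddef>

    void UpscaleBlock(const libgav1::dsp::Dsp& dsp, void* coefficients,
                      void* source, const ptrdiff_t source_stride,
                      const int height, const int downscaled_width,
                      const int upscaled_width, const int initial_subpixel_x,
                      const int step, void* dest,
                      const ptrdiff_t dest_stride) {
      if (dsp.super_res_coefficients != nullptr) {
        dsp.super_res_coefficients(upscaled_width, initial_subpixel_x, step,
                                   coefficients);
      }
      dsp.super_res(coefficients, source, source_stride, height,
                    downscaled_width, upscaled_width, initial_subpixel_x,
                    step, dest, dest_stride);
    }
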
// Loop restoration function signature. Sections 7.16, 7.17.
-// |source| is the input frame buffer, which is deblocked and cdef filtered.
-// |dest| is the output.
// |restoration_info| contains loop restoration information, such as filter
// type, strength.
-// |source_stride| and |dest_stride| are given in pixels.
-// |buffer| contains buffers required for self guided filter and wiener filter.
-// They must be initialized before calling.
+// |source| is the input frame buffer, which is deblocked and cdef filtered.
+// |top_border| and |bottom_border| are the top and bottom borders.
+// |dest| is the output.
+// |stride| is given in pixels, and shared by |source| and |dest|.
+// |top_border_stride| and |bottom_border_stride| are given in pixels.
+// |restoration_buffer| contains buffers required for self guided filter and
+// wiener filter. They must be initialized before calling.
using LoopRestorationFunc = void (*)(
- const void* source, void* dest, const RestorationUnitInfo& restoration_info,
- ptrdiff_t source_stride, ptrdiff_t dest_stride, int width, int height,
- RestorationBuffer* buffer);
+ const RestorationUnitInfo& restoration_info, const void* source,
+ ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
+ const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
+ int height, RestorationBuffer* restoration_buffer, void* dest);
// Index 0 is Wiener Filter.
// Index 1 is Self Guided Restoration Filter.
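
A caller sketch for the reordered signature (the Dsp member name
loop_restorations is assumed; index 1 selects the self-guided filter per the
comment above):

    #include <cstddef>
    #include <cstdint>

    void RunSelfGuided(const libgav1::dsp::Dsp& dsp,
                       const libgav1::RestorationUnitInfo& info,
                       const uint8_t* source, const ptrdiff_t stride,
                       const uint8_t* top_border,
                       const ptrdiff_t top_border_stride,
                       const uint8_t* bottom_border,
                       const ptrdiff_t bottom_border_stride, const int width,
                       const int height, libgav1::RestorationBuffer* buffer,
                       uint8_t* dest) {
      dsp.loop_restorations[1](info, source, stride, top_border,
                               top_border_stride, bottom_border,
                               bottom_border_stride, width, height, buffer,
                               dest);
    }
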
@@ -383,7 +415,7 @@
// |vertical_filter_index|/|horizontal_filter_index| is the index to
// retrieve the type of filter to be applied for vertical/horizontal direction
// from the filter lookup table 'kSubPixelFilters'.
-// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
+// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
// |width| and |height| are width and height of the block to be filtered.
// |ref_last_x| and |ref_last_y| are the last pixel of the reference frame in
// x/y direction.
@@ -395,9 +427,10 @@
// be used.
using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
int horizontal_filter_index,
- int vertical_filter_index, int subpixel_x,
- int subpixel_y, int width, int height,
- void* prediction, ptrdiff_t pred_stride);
+ int vertical_filter_index,
+ int horizontal_filter_id, int vertical_filter_id,
+ int width, int height, void* prediction,
+ ptrdiff_t pred_stride);
// Convolve functions signature. Each points to one convolve function with
// a specific setting:
@@ -815,7 +848,8 @@
MvProjectionCompoundFunc mv_projection_compound[3];
MvProjectionSingleFunc mv_projection_single[3];
ObmcBlendFuncs obmc_blend;
- SuperResRowFunc super_res_row;
+ SuperResCoefficientsFunc super_res_coefficients;
+ SuperResFunc super_res;
WarpCompoundFunc warp_compound;
WarpFunc warp;
WeightMaskFuncs weight_mask;
@@ -834,6 +868,14 @@
namespace dsp_internal {
+// Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
+// functions if /arch:AVX2 is used across all sources.
+#if !LIBGAV1_TARGETING_AVX2 && \
+ (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
+#undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
+#endif
+
// Returns true if a more highly optimized version of |func| is not defined for
// the associated bitdepth or if it is forcibly enabled with
// LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
@@ -848,6 +890,12 @@
// NEON support is the only extension available for ARM and it is always
// required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
// true and can be omitted.
+#define DSP_ENABLED_8BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
+#define DSP_ENABLED_10BPP_AVX2(func) \
+ (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
+ LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
#define DSP_ENABLED_8BPP_SSE4_1(func) \
(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
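
An illustrative guard using the new AVX2 macros (|SomeFunction| and the
assignment target are placeholders, not real libgav1 entry points):

    #if DSP_ENABLED_8BPP_AVX2(SomeFunction)
      dsp->some_function = SomeFunction_AVX2;
    #endif
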
@@ -855,6 +903,11 @@
(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
+// Initializes C-only function pointers. Note some entries may be set to
+// nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
+// for use in tests only; it is not thread-safe.
+void DspInit_C();
+
// Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
// exist. This version is meant for use by test or dsp/*Init() functions only.
dsp::Dsp* GetWritableDspTable(int bitdepth);
diff --git a/libgav1/src/dsp/film_grain.cc b/libgav1/src/dsp/film_grain.cc
index 2ee290b..41d1dd0 100644
--- a/libgav1/src/dsp/film_grain.cc
+++ b/libgav1/src/dsp/film_grain.cc
@@ -209,7 +209,7 @@
luma += luma_grain[(luma_y + i) * kLumaWidth + (luma_x + j)];
} while (++j <= subsampling_x);
} while (++i <= subsampling_y);
- luma = RightShiftWithRounding(luma, subsampling_x + subsampling_y);
+ luma = SubsampledValue(luma, subsampling_x + subsampling_y);
const int coeff_u = params.auto_regression_coeff_u[pos];
const int coeff_v = params.auto_regression_coeff_v[pos];
sum_u += luma * coeff_u;
diff --git a/libgav1/src/dsp/film_grain.h b/libgav1/src/dsp/film_grain.h
index fe93270..f75a354 100644
--- a/libgav1/src/dsp/film_grain.h
+++ b/libgav1/src/dsp/film_grain.h
@@ -25,6 +25,14 @@
// ARM:
#include "src/dsp/arm/film_grain_neon.h"
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each header tests for a superior
+// version before setting the base.
+// clang-format off
+#include "src/dsp/x86/film_grain_sse4.h"
+// clang-format on
+
// IWYU pragma: end_exports
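The ordering matters because each architecture header only claims a function
if a better version has not already claimed it. A schematic of that pattern —
the macro name below is illustrative, not one added by this change:

// A film_grain_avx2.h would be included first and, when targeting AVX2:
//   #ifndef LIBGAV1_Dsp8bpp_FilmGrainSomeFunc
//   #define LIBGAV1_Dsp8bpp_FilmGrainSomeFunc LIBGAV1_CPU_AVX2
//   #endif
// film_grain_sse4.h is included after; its own #ifndef then leaves the AVX2
// assignment in place and only defines LIBGAV1_CPU_SSE4_1 as a fallback.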
namespace libgav1 {
diff --git a/libgav1/src/dsp/intrapred.cc b/libgav1/src/dsp/intrapred.cc
index 4bcb580..4520c2c 100644
--- a/libgav1/src/dsp/intrapred.cc
+++ b/libgav1/src/dsp/intrapred.cc
@@ -19,21 +19,18 @@
#include <cstddef>
#include <cstdint>
#include <cstdlib>
-#include <cstring> // memset
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#include "src/utils/memory.h"
namespace libgav1 {
namespace dsp {
namespace {
-constexpr TransformSize kTransformSizesLargerThan32x32[] = {
- kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
- kTransformSize64x32, kTransformSize64x64};
-
template <int block_width, int block_height, typename Pixel>
struct IntraPredFuncs_C {
IntraPredFuncs_C() = delete;
@@ -50,12 +47,6 @@
const void* left_column);
static void Paeth(void* dest, ptrdiff_t stride, const void* top_row,
const void* left_column);
- static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
- const void* left_column);
- static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
- const void* left_column);
- static void SmoothHorizontal(void* dest, ptrdiff_t stride,
- const void* top_row, const void* left_column);
};
// Intra-predictors that require bitdepth.
@@ -190,16 +181,6 @@
}
}
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b) {
- return static_cast<Pixel>((a + b + 1) >> 1);
-}
-
-template <typename Pixel>
-inline Pixel Average(Pixel a, Pixel b, Pixel c) {
- return static_cast<Pixel>((a + 2 * b + c + 2) >> 2);
-}
-
// IntraPredFuncs_C::Paeth
template <int block_width, int block_height, typename Pixel>
void IntraPredFuncs_C<block_width, block_height, Pixel>::Paeth(
@@ -238,110 +219,6 @@
}
}
-constexpr uint8_t kSmoothWeights[] = {
- // block dimension = 4
- 255, 149, 85, 64,
- // block dimension = 8
- 255, 197, 146, 105, 73, 50, 37, 32,
- // block dimension = 16
- 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
- // block dimension = 32
- 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
- 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
- // block dimension = 64
- 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
- 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
- 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
- 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
-
-// IntraPredFuncs_C::Smooth
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::Smooth(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel top_right = top[block_width - 1];
- const Pixel bottom_left = left[block_height - 1];
- static_assert(
- block_width >= 4 && block_height >= 4,
- "Weights for smooth predictor undefined for block width/height < 4");
- const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
- const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
- uint32_t pred = weights_y[y] * top[x];
- pred += weights_x[x] * left[y];
- pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
- pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
- // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
- // + 256. With the descale there's no need for saturation.
- dst[x] = static_cast<Pixel>(
- RightShiftWithRounding(pred, kSmoothWeightScale + 1));
- }
- dst += stride;
- }
-}
-
-// IntraPredFuncs_C::SmoothVertical
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel bottom_left = left[block_height - 1];
- static_assert(block_height >= 4,
- "Weights for smooth predictor undefined for block height < 4");
- const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_y[y]);
- uint32_t pred = weights_y[y] * top[x];
- pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
- dst[x] =
- static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
- }
- dst += stride;
- }
-}
-
-// IntraPredFuncs_C::SmoothHorizontal
-template <int block_width, int block_height, typename Pixel>
-void IntraPredFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
- void* const dest, ptrdiff_t stride, const void* const top_row,
- const void* const left_column) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- const Pixel top_right = top[block_width - 1];
- static_assert(block_width >= 4,
- "Weights for smooth predictor undefined for block width < 4");
- const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
- const uint16_t scale_value = (1 << kSmoothWeightScale);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(scale_value >= weights_x[x]);
- uint32_t pred = weights_x[x] * left[y];
- pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
- dst[x] =
- static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
- }
- dst += stride;
- }
-}
-
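For reference while these predictors move to their own file, a worked
instance of the full Smooth blend for the top-left pixel of a 4x4 block,
assuming kSmoothWeightScale == 8 (scale_value == 256) as in the deleted code:

// x == y == 0, 4x4 block: weights_x[0] == weights_y[0] == 255.
//   pred = 255*top[0] + 255*left[0] + 1*bottom_left + 1*top_right
// With all four inputs equal to 100:
//   pred = 25500 + 25500 + 100 + 100 = 51200
//   dst  = RightShiftWithRounding(51200, 9) = (51200 + 256) >> 9 = 100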
//------------------------------------------------------------------------------
// IntraPredBppFuncs_C
template <int fill, typename Pixel>
@@ -366,288 +243,7 @@
block_height);
}
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_C
-
-template <int bitdepth, typename Pixel>
-void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const FilterIntraPredictor pred, const int width,
- const int height) {
- const int kMaxPixel = (1 << bitdepth) - 1;
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
-
- assert(width <= 32 && height <= 32);
-
- Pixel buffer[3][33]; // cache 2 rows + top & left boundaries
- memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
-
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
- int row0 = 0, row2 = 2;
- int ystep = 1;
- int y = 0;
- do {
- buffer[1][0] = left[y];
- buffer[row2][0] = left[y + 1];
- int x = 1;
- do {
- const Pixel p0 = buffer[row0][x - 1]; // top-left
- const Pixel p1 = buffer[row0][x + 0]; // top 0
- const Pixel p2 = buffer[row0][x + 1]; // top 1
- const Pixel p3 = buffer[row0][x + 2]; // top 2
- const Pixel p4 = buffer[row0][x + 3]; // top 3
- const Pixel p5 = buffer[1][x - 1]; // left 0
- const Pixel p6 = buffer[row2][x - 1]; // left 1
- for (int i = 0; i < 8; ++i) {
- const int xoffset = i & 0x03;
- const int yoffset = (i >> 2) * ystep;
- const int value = kFilterIntraTaps[pred][i][0] * p0 +
- kFilterIntraTaps[pred][i][1] * p1 +
- kFilterIntraTaps[pred][i][2] * p2 +
- kFilterIntraTaps[pred][i][3] * p3 +
- kFilterIntraTaps[pred][i][4] * p4 +
- kFilterIntraTaps[pred][i][5] * p5 +
- kFilterIntraTaps[pred][i][6] * p6;
- buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
- Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
- }
- x += 4;
- } while (x < width);
- memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
- dst += stride;
- memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
- dst += stride;
-
- // The final row becomes the top for the next pass.
- row0 ^= 2;
- row2 ^= 2;
- ystep = -ystep;
- y += 2;
- } while (y < height);
-}
-
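Each 4x2 patch of outputs is a 7-tap blend of one top-left, four top, and two
left neighbors. A compact restatement of the inner step, assuming the
kFilterIntraTaps table from the deleted code:

// For output i in 0..7 within the current 4x2 patch:
//   value = taps[pred][i][0]*p0 + taps[pred][i][1]*p1 + taps[pred][i][2]*p2 +
//           taps[pred][i][3]*p3 + taps[pred][i][4]*p4 + taps[pred][i][5]*p5 +
//           taps[pred][i][6]*p6;
//   out   = Clip3(RightShiftWithRounding(value, 4), 0, (1 << bitdepth) - 1);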
-//------------------------------------------------------------------------------
-// CflIntraPredictor_C
-
-// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
-// |alpha| can be -16 to 16 (inclusive).
-template <int block_width, int block_height, int bitdepth, typename Pixel>
-void CflIntraPredictor_C(
- void* const dest, ptrdiff_t stride,
- const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int alpha) {
- auto* dst = static_cast<Pixel*>(dest);
- const int dc = dst[0];
- stride /= sizeof(Pixel);
- const int max_value = (1 << bitdepth) - 1;
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
- assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
- dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
- 0, max_value);
- }
- dst += stride;
- }
-}
-
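The CfL reconstruction is a clipped linear model around the block's DC value.
A worked instance with assumed inputs (dc = 128, alpha = 8, luma = 96,
bitdepth = 8):

//   scaled = RightShiftWithRoundingSigned(8 * 96, 6)
//          = (768 + 32) >> 6 = 12
//   dst    = Clip3(128 + 12, 0, 255) = 140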
-//------------------------------------------------------------------------------
-// CflSubsampler_C
-
-template <int block_width, int block_height, int bitdepth, typename Pixel,
- int subsampling_x, int subsampling_y>
-void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int max_luma_width, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
- assert(max_luma_width >= 4);
- assert(max_luma_height >= 4);
- const auto* src = static_cast<const Pixel*>(source);
- stride /= sizeof(Pixel);
- int sum = 0;
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- const ptrdiff_t luma_x =
- std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
- const ptrdiff_t luma_x_next = luma_x + stride;
- luma[y][x] =
- (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
- ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
- : 0))
- << (3 - subsampling_x - subsampling_y);
- sum += luma[y][x];
- }
- if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
- src += stride << subsampling_y;
- }
- }
- const int average = RightShiftWithRounding(
- sum, FloorLog2(block_width) + FloorLog2(block_height));
- for (int y = 0; y < block_height; ++y) {
- for (int x = 0; x < block_width; ++x) {
- luma[y][x] -= average;
- }
- }
-}
-
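The left shift by (3 - subsampling_x - subsampling_y) puts every subsampling
mode on the same scale: each luma[y][x] entry ends up at 8x a (possibly
averaged) luma sample, which matches the +/-(((1 << bitdepth) - 1) << 3)
bound documented above. A worked check under that reading:

// 4:4:4 (sx = sy = 0): one sample a         -> a << 3          == 8 * a
// 4:2:0 (sx = sy = 1): four samples summed  -> (sum of 4) << 1 == 8 * average
//   e.g. samples 100, 101, 102, 103: (406 << 1) = 812 = 8 * 101.5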
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- assert(xstep > 0);
-
- // If xstep == 64 then |shift| always evaluates to 0 which sets |val| to
- // |top[top_base_x]|. This corresponds to a 45 degree prediction.
- if (xstep == 64) {
- // 7.11.2.10. Intra edge upsample selection process
- // if ( d <= 0 || d >= 40 ) useUpsample = 0
- // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
- // |predictor_angle| is 45 the delta is also 45.
- assert(!upsampled_top);
- const Pixel* top_ptr = top + 1;
- for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
- memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
- }
- return;
- }
-
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int max_base_x = ((width + height) - 1) << upsample_shift;
- const int scale_bits = 6 - upsample_shift;
- const int base_step = 1 << upsample_shift;
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
-
- if (top_base_x >= max_base_x) {
- for (int i = y; i < height; ++i) {
- Memset(dst, top[max_base_x], width);
- dst += stride;
- }
- return;
- }
-
- const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
- int x = 0;
- do {
- if (top_base_x >= max_base_x) {
- Memset(dst + x, top[max_base_x], width - x);
- break;
- }
-
- const int val =
- top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
- dst[x] = RightShiftWithRounding(val, 5);
- top_base_x += base_step;
- } while (++x < width);
-
- dst += stride;
- top_x += xstep;
- } while (++y < height);
-}
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
- const auto* const top = static_cast<const Pixel*>(top_row);
- const auto* const left = static_cast<const Pixel*>(left_column);
- auto* dst = static_cast<Pixel*>(dest);
- stride /= sizeof(Pixel);
-
- assert(xstep > 0);
- assert(ystep > 0);
-
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int scale_bits_x = 6 - upsample_top_shift;
- const int scale_bits_y = 6 - upsample_left_shift;
- const int min_base_x = -(1 << upsample_top_shift);
- const int base_step_x = 1 << upsample_top_shift;
- int y = 0;
- int top_x = -xstep;
- do {
- int top_base_x = top_x >> scale_bits_x;
- int left_y = (y << 6) - ystep;
- int x = 0;
- do {
- int val;
- if (top_base_x >= min_base_x) {
- const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
- val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
- } else {
- // Note this assumes an arithmetic shift to handle negative values.
- const int left_base_y = left_y >> scale_bits_y;
- const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
- assert(left_base_y >= -(1 << upsample_left_shift));
- val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
- }
- dst[x] = RightShiftWithRounding(val, 5);
- top_base_x += base_step_x;
- left_y -= ystep;
- } while (++x < width);
-
- top_x -= xstep;
- dst += stride;
- } while (++y < height);
-}
-
-template <typename Pixel>
-void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled_left) {
- const auto* const left = static_cast<const Pixel*>(left_column);
- stride /= sizeof(Pixel);
-
- assert(ystep > 0);
-
- const int upsample_shift = static_cast<int>(upsampled_left);
- const int scale_bits = 6 - upsample_shift;
- const int base_step = 1 << upsample_shift;
- // Zone3 never runs out of left_column values.
- assert((width + height - 1) << upsample_shift > // max_base_y
- ((ystep * width) >> scale_bits) +
- base_step * (height - 1)); // left_base_y
-
- int left_y = ystep;
- int x = 0;
- do {
- auto* dst = static_cast<Pixel*>(dest);
-
- int left_base_y = left_y >> scale_bits;
- int y = 0;
- do {
- const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
- const int val =
- left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
- dst[x] = RightShiftWithRounding(val, 5);
- dst += stride;
- left_base_y += base_step;
- } while (++y < height);
-
- left_y += ystep;
- } while (++x < width);
-}
-
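All three zones share the same two-tap interpolation: a 6-bit fractional
position is split into a base index and a 5-bit weight, and the two neighbors
are blended with weights summing to 32. A worked step for Zone 1 with assumed
values (xstep = 40, no upsampling, first row):

//   top_base_x = 40 >> 6          = 0
//   shift      = (40 & 0x3F) >> 1 = 20
//   val        = top[0] * (32 - 20) + top[1] * 20
//   dst[0]     = RightShiftWithRounding(val, 5)   // (val + 16) >> 5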
-//------------------------------------------------------------------------------
+// -----------------------------------------------------------------------------
template <typename Pixel>
struct IntraPredDefs {
@@ -718,15 +314,7 @@
dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorHorizontal] = \
DEFS::_##W##x##H::Horizontal; \
dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorPaeth] = \
- DEFS::_##W##x##H::Paeth; \
- dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
- DEFS::_##W##x##H::Smooth; \
- dsp->intra_predictors[kTransformSize##W##x##H] \
- [kIntraPredictorSmoothVertical] = \
- DEFS::_##W##x##H::SmoothVertical; \
- dsp->intra_predictors[kTransformSize##W##x##H] \
- [kIntraPredictorSmoothHorizontal] = \
- DEFS::_##W##x##H::SmoothHorizontal
+ DEFS::_##W##x##H::Paeth
#define INIT_INTRAPREDICTORS(DEFS, DEFSBPP) \
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 4, 4); \
@@ -749,45 +337,11 @@
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 32); \
INIT_INTRAPREDICTORS_WxH(DEFS, DEFSBPP, 64, 64)
-#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
- dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
- CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
- dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
- CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
-
-#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
- INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
INIT_INTRAPREDICTORS(Defs, Defs8bpp);
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint8_t>;
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint8_t>;
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint8_t>;
- dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
- INIT_CFL_INTRAPREDICTORS(8, uint8_t);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -816,19 +370,6 @@
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
Defs::_4x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
- Defs::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
- Defs::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
- Defs::_4x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
Defs8bpp::_4x8::DcFill;
@@ -856,19 +397,6 @@
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
Defs::_4x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
- Defs::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
- Defs::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
- Defs::_4x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
Defs8bpp::_4x16::DcFill;
@@ -897,19 +425,6 @@
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
Defs::_4x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
- Defs::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
- Defs::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
- Defs::_4x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
Defs8bpp::_8x4::DcFill;
@@ -937,19 +452,6 @@
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
Defs::_8x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
- Defs::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
- Defs::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
- Defs::_8x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
Defs8bpp::_8x8::DcFill;
@@ -977,19 +479,6 @@
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
Defs::_8x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
- Defs::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
- Defs::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
- Defs::_8x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
Defs8bpp::_8x16::DcFill;
@@ -1018,19 +507,6 @@
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
Defs::_8x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
- Defs::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
- Defs::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
- Defs::_8x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
Defs8bpp::_8x32::DcFill;
@@ -1059,19 +535,6 @@
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
Defs::_8x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
- Defs::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
- Defs::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
- Defs::_8x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
Defs8bpp::_16x4::DcFill;
@@ -1100,19 +563,6 @@
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
Defs::_16x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
- Defs::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
- Defs::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
- Defs::_16x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
Defs8bpp::_16x8::DcFill;
@@ -1141,19 +591,6 @@
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
Defs::_16x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
- Defs::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
- Defs::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
- Defs::_16x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
Defs8bpp::_16x16::DcFill;
@@ -1182,19 +619,6 @@
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
Defs::_16x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
- Defs::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
- Defs::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
- Defs::_16x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
Defs8bpp::_16x32::DcFill;
@@ -1223,19 +647,6 @@
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
Defs::_16x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
- Defs::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
- Defs::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
- Defs::_16x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
Defs8bpp::_16x64::DcFill;
@@ -1264,19 +675,6 @@
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
Defs::_16x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
- Defs::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
- Defs::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
- Defs::_16x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
Defs8bpp::_32x8::DcFill;
@@ -1305,19 +703,6 @@
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
Defs::_32x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
- Defs::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
- Defs::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
- Defs::_32x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
Defs8bpp::_32x16::DcFill;
@@ -1346,19 +731,6 @@
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
Defs::_32x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
- Defs::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
- Defs::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
- Defs::_32x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
Defs8bpp::_32x32::DcFill;
@@ -1387,19 +759,6 @@
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
Defs::_32x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
- Defs::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
- Defs::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
- Defs::_32x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
Defs8bpp::_32x64::DcFill;
@@ -1428,19 +787,6 @@
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
Defs::_32x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
- Defs::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
- Defs::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
- Defs::_32x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
Defs8bpp::_64x16::DcFill;
@@ -1469,19 +815,6 @@
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
Defs::_64x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
- Defs::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
- Defs::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
- Defs::_64x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
Defs8bpp::_64x32::DcFill;
@@ -1510,19 +843,6 @@
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
Defs::_64x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
- Defs::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
- Defs::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
- Defs::_64x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
Defs8bpp::_64x64::DcFill;
@@ -1551,282 +871,7 @@
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
Defs::_64x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
- Defs::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
- Defs::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
- Defs::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
- dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x4] =
- CflIntraPredictor_C<4, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
- CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
- CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
- CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x8] =
- CflIntraPredictor_C<4, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
- CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
- CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
- CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x16] =
- CflIntraPredictor_C<4, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
- CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
- CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
- CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x4] =
- CflIntraPredictor_C<8, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
- CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
- CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
- CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x8] =
- CflIntraPredictor_C<8, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
- CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
- CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
- CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x16] =
- CflIntraPredictor_C<8, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
- CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
- CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
- CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x32] =
- CflIntraPredictor_C<8, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
- CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
- CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
- CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x4] =
- CflIntraPredictor_C<16, 4, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
- CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
- CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
- CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x8] =
- CflIntraPredictor_C<16, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
- CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
- CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
- CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x16] =
- CflIntraPredictor_C<16, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
- CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
- CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
- CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x32] =
- CflIntraPredictor_C<16, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
- CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
- CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
- CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x8] =
- CflIntraPredictor_C<32, 8, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
- CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
- CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
- CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x16] =
- CflIntraPredictor_C<32, 16, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
- CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
- CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
- CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x32] =
- CflIntraPredictor_C<32, 32, 8, uint8_t>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
- CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
- CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
- CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
-#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- // Cfl predictors are available only for transform sizes with max(width,
- // height) <= 32. Set all others to nullptr.
- for (const auto i : kTransformSizesLargerThan32x32) {
- dsp->cfl_intra_predictors[i] = nullptr;
- for (int j = 0; j < kNumSubsamplingTypes; ++j) {
- dsp->cfl_subsamplers[i][j] = nullptr;
- }
- }
} // NOLINT(readability/fn_size)
#if LIBGAV1_MAX_BITDEPTH >= 10
@@ -1838,14 +883,6 @@
assert(dsp != nullptr);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
INIT_INTRAPREDICTORS(DefsHbd, Defs10bpp);
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint16_t>;
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint16_t>;
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint16_t>;
- dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
- INIT_CFL_INTRAPREDICTORS(10, uint16_t);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcFill] =
@@ -1875,19 +912,6 @@
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorPaeth] =
DefsHbd::_4x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
- DefsHbd::_4x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorDcFill] =
Defs10bpp::_4x8::DcFill;
@@ -1916,19 +940,6 @@
dsp->intra_predictors[kTransformSize4x8][kIntraPredictorPaeth] =
DefsHbd::_4x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
- DefsHbd::_4x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorDcFill] =
Defs10bpp::_4x16::DcFill;
@@ -1957,19 +968,6 @@
dsp->intra_predictors[kTransformSize4x16][kIntraPredictorPaeth] =
DefsHbd::_4x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
- DefsHbd::_4x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_4x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_4x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorDcFill] =
Defs10bpp::_8x4::DcFill;
@@ -1998,19 +996,6 @@
dsp->intra_predictors[kTransformSize8x4][kIntraPredictorPaeth] =
DefsHbd::_8x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
- DefsHbd::_8x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorDcFill] =
Defs10bpp::_8x8::DcFill;
@@ -2039,19 +1024,6 @@
dsp->intra_predictors[kTransformSize8x8][kIntraPredictorPaeth] =
DefsHbd::_8x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
- DefsHbd::_8x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorDcFill] =
Defs10bpp::_8x16::DcFill;
@@ -2080,19 +1052,6 @@
dsp->intra_predictors[kTransformSize8x16][kIntraPredictorPaeth] =
DefsHbd::_8x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
- DefsHbd::_8x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorDcFill] =
Defs10bpp::_8x32::DcFill;
@@ -2121,19 +1080,6 @@
dsp->intra_predictors[kTransformSize8x32][kIntraPredictorPaeth] =
DefsHbd::_8x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
- DefsHbd::_8x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_8x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_8x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorDcFill] =
Defs10bpp::_16x4::DcFill;
@@ -2162,19 +1108,6 @@
dsp->intra_predictors[kTransformSize16x4][kIntraPredictorPaeth] =
DefsHbd::_16x4::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
- DefsHbd::_16x4::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x4::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x4::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorDcFill] =
Defs10bpp::_16x8::DcFill;
@@ -2203,19 +1136,6 @@
dsp->intra_predictors[kTransformSize16x8][kIntraPredictorPaeth] =
DefsHbd::_16x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
- DefsHbd::_16x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorDcFill] =
Defs10bpp::_16x16::DcFill;
@@ -2244,19 +1164,6 @@
dsp->intra_predictors[kTransformSize16x16][kIntraPredictorPaeth] =
DefsHbd::_16x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
- DefsHbd::_16x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorDcFill] =
Defs10bpp::_16x32::DcFill;
@@ -2285,19 +1192,6 @@
dsp->intra_predictors[kTransformSize16x32][kIntraPredictorPaeth] =
DefsHbd::_16x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
- DefsHbd::_16x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorDcFill] =
Defs10bpp::_16x64::DcFill;
@@ -2326,19 +1220,6 @@
dsp->intra_predictors[kTransformSize16x64][kIntraPredictorPaeth] =
DefsHbd::_16x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
- DefsHbd::_16x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_16x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_16x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorDcFill] =
Defs10bpp::_32x8::DcFill;
@@ -2367,19 +1248,6 @@
dsp->intra_predictors[kTransformSize32x8][kIntraPredictorPaeth] =
DefsHbd::_32x8::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
- DefsHbd::_32x8::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x8::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x8::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorDcFill] =
Defs10bpp::_32x16::DcFill;
@@ -2408,19 +1276,6 @@
dsp->intra_predictors[kTransformSize32x16][kIntraPredictorPaeth] =
DefsHbd::_32x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
- DefsHbd::_32x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorDcFill] =
Defs10bpp::_32x32::DcFill;
@@ -2449,19 +1304,6 @@
dsp->intra_predictors[kTransformSize32x32][kIntraPredictorPaeth] =
DefsHbd::_32x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
- DefsHbd::_32x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorDcFill] =
Defs10bpp::_32x64::DcFill;
@@ -2490,19 +1332,6 @@
dsp->intra_predictors[kTransformSize32x64][kIntraPredictorPaeth] =
DefsHbd::_32x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
- DefsHbd::_32x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_32x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_32x64::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorDcFill] =
Defs10bpp::_64x16::DcFill;
@@ -2531,19 +1360,6 @@
dsp->intra_predictors[kTransformSize64x16][kIntraPredictorPaeth] =
DefsHbd::_64x16::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
- DefsHbd::_64x16::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x16::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x16::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorDcFill] =
Defs10bpp::_64x32::DcFill;
@@ -2572,19 +1388,6 @@
dsp->intra_predictors[kTransformSize64x32][kIntraPredictorPaeth] =
DefsHbd::_64x32::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
- DefsHbd::_64x32::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x32::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x32::SmoothHorizontal;
-#endif
-
#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorDcFill
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorDcFill] =
Defs10bpp::_64x64::DcFill;
@@ -2613,291 +1416,12 @@
dsp->intra_predictors[kTransformSize64x64][kIntraPredictorPaeth] =
DefsHbd::_64x64::Paeth;
#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
- DefsHbd::_64x64::Smooth;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
- DefsHbd::_64x64::SmoothVertical;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
- dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
- DefsHbd::_64x64::SmoothHorizontal;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_C<uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_C<uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
- dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x4] =
- CflIntraPredictor_C<4, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
- CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
- CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
- CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x8] =
- CflIntraPredictor_C<4, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
- CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
- CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
- CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize4x16] =
- CflIntraPredictor_C<4, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
- CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
- CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
- CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x4] =
- CflIntraPredictor_C<8, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
- CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
- CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
- CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x8] =
- CflIntraPredictor_C<8, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
- CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
- CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
- CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x16] =
- CflIntraPredictor_C<8, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
- CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
- CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
- CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize8x32] =
- CflIntraPredictor_C<8, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
- CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
- CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
- CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x4] =
- CflIntraPredictor_C<16, 4, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
- CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
- CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
- CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x8] =
- CflIntraPredictor_C<16, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
- CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
- CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
- CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x16] =
- CflIntraPredictor_C<16, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
- CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
- CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
- CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize16x32] =
- CflIntraPredictor_C<16, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
- CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
- CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
- CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x8] =
- CflIntraPredictor_C<32, 8, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
- CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
- CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
- CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x16] =
- CflIntraPredictor_C<32, 16, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
- CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
- CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
- CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
-#endif
-
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
- dsp->cfl_intra_predictors[kTransformSize32x32] =
- CflIntraPredictor_C<32, 32, 10, uint16_t>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
- CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
- CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
-#endif
-#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
- dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
- CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
-#endif
-
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- // Cfl predictors are available only for transform sizes with max(width,
- // height) <= 32. Set all others to nullptr.
- for (const auto i : kTransformSizesLargerThan32x32) {
- dsp->cfl_intra_predictors[i] = nullptr;
- for (int j = 0; j < kNumSubsamplingTypes; ++j) {
- dsp->cfl_subsamplers[i][j] = nullptr;
- }
- }
} // NOLINT(readability/fn_size)
#endif // LIBGAV1_MAX_BITDEPTH >= 10
-#undef INIT_CFL_INTRAPREDICTOR_WxH
-#undef INIT_CFL_INTRAPREDICTORS
#undef INIT_INTRAPREDICTORS_WxH
#undef INIT_INTRAPREDICTORS
-
} // namespace
void IntraPredInit_C() {
diff --git a/libgav1/src/dsp/intrapred.h b/libgav1/src/dsp/intrapred.h
index c5286ef..2cb625d 100644
--- a/libgav1/src/dsp/intrapred.h
+++ b/libgav1/src/dsp/intrapred.h
@@ -38,9 +38,7 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor. This function is not thread-safe.
+// Initializes Dsp::intra_predictors. This function is not thread-safe.
void IntraPredInit_C();
} // namespace dsp
diff --git a/libgav1/src/dsp/intrapred_cfl.cc b/libgav1/src/dsp/intrapred_cfl.cc
new file mode 100644
index 0000000..948c0c0
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_cfl.cc
@@ -0,0 +1,654 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_cfl.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+constexpr TransformSize kTransformSizesLargerThan32x32[] = {
+ kTransformSize16x64, kTransformSize32x64, kTransformSize64x16,
+ kTransformSize64x32, kTransformSize64x64};
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_C
+
+// |luma| can be within +/-(((1 << bitdepth) - 1) << 3), inclusive.
+// |alpha| can be -16 to 16 (inclusive).
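+// For example, with bitdepth 8, dc = dst[0] = 128, alpha = 4 and
+// luma[y][x] = 64: alpha * luma = 256, and
+// RightShiftWithRoundingSigned(256, 6) = (256 + 32) >> 6 = 4, so
+// dst[x] = Clip3(128 + 4, 0, 255) = 132.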
+template <int block_width, int block_height, int bitdepth, typename Pixel>
+void CflIntraPredictor_C(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ auto* dst = static_cast<Pixel*>(dest);
+ const int dc = dst[0];
+ stride /= sizeof(Pixel);
+ const int max_value = (1 << bitdepth) - 1;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(luma[y][x] >= -(((1 << bitdepth) - 1) << 3));
+ assert(luma[y][x] <= ((1 << bitdepth) - 1) << 3);
+ dst[x] = Clip3(dc + RightShiftWithRoundingSigned(alpha * luma[y][x], 6),
+ 0, max_value);
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// CflSubsampler_C
+
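+// For 4:2:0 (subsampling_x = subsampling_y = 1), each luma[y][x] is the sum
+// of a 2x2 block of luma samples shifted left by 1; e.g. samples
+// {100, 102, 98, 100} give 400 << 1 = 800, i.e. 8x their average, matching
+// the << 3 fixed-point scale of the 4:4:4 path. The block average is then
+// subtracted so only the zero-mean ("AC") component reaches
+// CflIntraPredictor_C.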
+template <int block_width, int block_height, int bitdepth, typename Pixel,
+ int subsampling_x, int subsampling_y>
+void CflSubsampler_C(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const auto* src = static_cast<const Pixel*>(source);
+ stride /= sizeof(Pixel);
+ int sum = 0;
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ const ptrdiff_t luma_x =
+ std::min(x << subsampling_x, max_luma_width - (1 << subsampling_x));
+ const ptrdiff_t luma_x_next = luma_x + stride;
+ luma[y][x] =
+ (src[luma_x] + ((subsampling_x != 0) ? src[luma_x + 1] : 0) +
+ ((subsampling_y != 0) ? (src[luma_x_next] + src[luma_x_next + 1])
+ : 0))
+ << (3 - subsampling_x - subsampling_y);
+ sum += luma[y][x];
+ }
+ if ((y << subsampling_y) < (max_luma_height - (1 << subsampling_y))) {
+ src += stride << subsampling_y;
+ }
+ }
+ const int average = RightShiftWithRounding(
+ sum, FloorLog2(block_width) + FloorLog2(block_height));
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ luma[y][x] -= average;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+// Initializes dsp entries for kTransformSize|W|x|H|.
+#define INIT_CFL_INTRAPREDICTOR_WxH(W, H, BITDEPTH, PIXEL) \
+ dsp->cfl_intra_predictors[kTransformSize##W##x##H] = \
+ CflIntraPredictor_C<W, H, BITDEPTH, PIXEL>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType444] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 0, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType422] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 0>; \
+ dsp->cfl_subsamplers[kTransformSize##W##x##H][kSubsamplingType420] = \
+ CflSubsampler_C<W, H, BITDEPTH, PIXEL, 1, 1>
+
+#define INIT_CFL_INTRAPREDICTORS(BITDEPTH, PIXEL) \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(4, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(8, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 4, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(16, 32, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 8, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 16, BITDEPTH, PIXEL); \
+ INIT_CFL_INTRAPREDICTOR_WxH(32, 32, BITDEPTH, PIXEL)
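+
+// For instance, INIT_CFL_INTRAPREDICTOR_WxH(4, 4, 8, uint8_t) expands to
+//   dsp->cfl_intra_predictors[kTransformSize4x4] =
+//       CflIntraPredictor_C<4, 4, 8, uint8_t>;
+// followed by the three matching cfl_subsamplers[kTransformSize4x4][...]
+// assignments.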
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(8, uint8_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 8, uint8_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 8, uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 8, uint8_t, 1, 1>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_CFL_INTRAPREDICTORS(10, uint16_t);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_C<4, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType422] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler_C<4, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_C<4, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType422] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler_C<4, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_C<4, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType422] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler_C<4, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_C<8, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType422] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler_C<8, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_C<8, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType422] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler_C<8, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_C<8, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType422] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler_C<8, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_C<8, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType422] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler_C<8, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_C<16, 4, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType422] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler_C<16, 4, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_C<16, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType422] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler_C<16, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_C<16, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType422] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler_C<16, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_C<16, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType422] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler_C<16, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_C<32, 8, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType422] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler_C<32, 8, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_C<32, 16, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType422] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler_C<32, 16, 10, uint16_t, 1, 1>;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_C<32, 32, 10, uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 0, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler422
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType422] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 0>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler_C<32, 32, 10, uint16_t, 1, 1>;
+#endif
+
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ // Cfl predictors are available only for transform sizes with max(width,
+ // height) <= 32. Set all others to nullptr.
+ for (const auto i : kTransformSizesLargerThan32x32) {
+ dsp->cfl_intra_predictors[i] = nullptr;
+ for (int j = 0; j < kNumSubsamplingTypes; ++j) {
+ dsp->cfl_subsamplers[i][j] = nullptr;
+ }
+ }
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
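+// The INIT helper macros are only meaningful within this file; the #undefs
+// keep them from leaking into other translation units (e.g. in
+// combined-source builds).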
+#undef INIT_CFL_INTRAPREDICTOR_WxH
+#undef INIT_CFL_INTRAPREDICTORS
+
+} // namespace
+
+void IntraPredCflInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_cfl.h b/libgav1/src/dsp/intrapred_cfl.h
new file mode 100644
index 0000000..4e8a11f
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_cfl.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_cfl_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_cfl_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers.
+// This function is not thread-safe.
+void IntraPredCflInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_CFL_H_
diff --git a/libgav1/src/dsp/intrapred_directional.cc b/libgav1/src/dsp/intrapred_directional.cc
new file mode 100644
index 0000000..e670769
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_directional.cc
@@ -0,0 +1,252 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
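+//
+// All three zones interpolate between two adjacent reference samples with
+// 1/32-pel weights: val = ref[base] * (32 - shift) + ref[base + 1] * shift,
+// then RightShiftWithRounding(val, 5). For example, in Zone 1 with xstep = 32
+// and no upsampling, row y = 0 has top_x = 32, so top_base_x = 32 >> 6 = 0
+// and shift = (32 & 0x3F) >> 1 = 16, blending top[x] and top[x + 1] with
+// equal weight for every x in the row.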
+
+template <typename Pixel>
+void DirectionalIntraPredictorZone1_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+
+  // If xstep == 64 then |shift| always evaluates to 0, which sets |val| to
+  // |top[top_base_x]|. This corresponds to a 45 degree prediction.
+  if (xstep == 64) {
+    // 7.11.2.10. Intra edge upsample selection process
+    // if ( d <= 0 || d >= 40 ) useUpsample = 0
+    // For |upsampled_top| the delta is |predictor_angle - 90|. Since the
+    // |predictor_angle| is 45, the delta is also 45.
+ assert(!upsampled_top);
+ const Pixel* top_ptr = top + 1;
+ for (int y = 0; y < height; ++y, dst += stride, ++top_ptr) {
+ memcpy(dst, top_ptr, sizeof(*top_ptr) * width);
+ }
+ return;
+ }
+
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ Memset(dst, top[max_base_x], width);
+ dst += stride;
+ }
+ return;
+ }
+
+ const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;
+ int x = 0;
+ do {
+ if (top_base_x >= max_base_x) {
+ Memset(dst + x, top[max_base_x], width - x);
+ break;
+ }
+
+ const int val =
+ top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5 /*log2(32)*/);
+ top_base_x += base_step;
+ } while (++x < width);
+
+ dst += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
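+// Zone 2 covers prediction angles between 90 and 180 degrees. Within a row,
+// pixels whose projection top_base_x still lies left of min_base_x
+// interpolate from |left_column|; once top_base_x reaches min_base_x, the
+// rest of the row interpolates from |top_row|.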
+template <typename Pixel>
+void DirectionalIntraPredictorZone2_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ assert(xstep > 0);
+ assert(ystep > 0);
+
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int scale_bits_x = 6 - upsample_top_shift;
+ const int scale_bits_y = 6 - upsample_left_shift;
+ const int min_base_x = -(1 << upsample_top_shift);
+ const int base_step_x = 1 << upsample_top_shift;
+ int y = 0;
+ int top_x = -xstep;
+ do {
+ int top_base_x = top_x >> scale_bits_x;
+ int left_y = (y << 6) - ystep;
+ int x = 0;
+ do {
+ int val;
+ if (top_base_x >= min_base_x) {
+ const int shift = ((top_x * (1 << upsample_top_shift)) & 0x3F) >> 1;
+ val = top[top_base_x] * (32 - shift) + top[top_base_x + 1] * shift;
+ } else {
+ // Note this assumes an arithmetic shift to handle negative values.
+ const int left_base_y = left_y >> scale_bits_y;
+ const int shift = ((left_y * (1 << upsample_left_shift)) & 0x3F) >> 1;
+ assert(left_base_y >= -(1 << upsample_left_shift));
+ val = left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ }
+ dst[x] = RightShiftWithRounding(val, 5);
+ top_base_x += base_step_x;
+ left_y -= ystep;
+ } while (++x < width);
+
+ top_x -= xstep;
+ dst += stride;
+ } while (++y < height);
+}
+
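+// Zone 3 covers angles above 180 degrees and mirrors Zone 1: prediction
+// walks down each column, interpolating adjacent |left_column| samples; the
+// assertion below guarantees left_base_y + 1 never reads past the end of
+// |left_column|.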
+template <typename Pixel>
+void DirectionalIntraPredictorZone3_C(void* const dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled_left) {
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ stride /= sizeof(Pixel);
+
+ assert(ystep > 0);
+
+ const int upsample_shift = static_cast<int>(upsampled_left);
+ const int scale_bits = 6 - upsample_shift;
+ const int base_step = 1 << upsample_shift;
+ // Zone3 never runs out of left_column values.
+ assert((width + height - 1) << upsample_shift > // max_base_y
+ ((ystep * width) >> scale_bits) +
+ base_step * (height - 1)); // left_base_y
+
+ int left_y = ystep;
+ int x = 0;
+ do {
+ auto* dst = static_cast<Pixel*>(dest);
+
+ int left_base_y = left_y >> scale_bits;
+ int y = 0;
+ do {
+ const int shift = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const int val =
+ left[left_base_y] * (32 - shift) + left[left_base_y + 1] * shift;
+ dst[x] = RightShiftWithRounding(val, 5);
+ dst += stride;
+ left_base_y += base_step;
+ } while (++y < height);
+
+ left_y += ystep;
+ } while (++x < width);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint8_t>;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+  static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone2
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_C<uint16_t>;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone3
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_C<uint16_t>;
+#endif
+#endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void IntraPredDirectionalInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_directional.h b/libgav1/src/dsp/intrapred_directional.h
new file mode 100644
index 0000000..bcd1bc1
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_directional.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_directional_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important, as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_directional_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*. This function is not
+// thread-safe.
+void IntraPredDirectionalInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_DIRECTIONAL_H_
diff --git a/libgav1/src/dsp/intrapred_filter.cc b/libgav1/src/dsp/intrapred_filter.cc
new file mode 100644
index 0000000..f4bd296
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_filter.cc
@@ -0,0 +1,144 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_C
+
+// The recursive filter applies a different 7-tap filter over the top-left
+// pixel, the 4 pixels above, and the 2 pixels to the left to produce each
+// pixel in a 4x2 sub-block. Each successive 4x2 uses the prediction output of
+// the blocks above and to the left, unless they are adjacent to the |top_row|
+// or |left_column|. The set of 8 filters is selected according to |pred|.
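+//
+// For example, filter entry i = 6 with ystep = 1 gives xoffset = 2 and
+// yoffset = 1, so its output lands in buffer[2][x + 2], the second row of the
+// current 4x2 sub-block; after each pair of rows, row0 ^= 2 recycles that row
+// as the top context for the next pass.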
+template <int bitdepth, typename Pixel>
+void FilterIntraPredictor_C(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const FilterIntraPredictor pred, const int width,
+ const int height) {
+ const int kMaxPixel = (1 << bitdepth) - 1;
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+
+ assert(width <= 32 && height <= 32);
+
+ Pixel buffer[3][33]; // cache 2 rows + top & left boundaries
+ memcpy(buffer[0], &top[-1], (width + 1) * sizeof(top[0]));
+
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+ int row0 = 0, row2 = 2;
+ int ystep = 1;
+ int y = 0;
+ do {
+ buffer[1][0] = left[y];
+ buffer[row2][0] = left[y + 1];
+ int x = 1;
+ do {
+ const Pixel p0 = buffer[row0][x - 1]; // top-left
+ const Pixel p1 = buffer[row0][x + 0]; // top 0
+ const Pixel p2 = buffer[row0][x + 1]; // top 1
+ const Pixel p3 = buffer[row0][x + 2]; // top 2
+ const Pixel p4 = buffer[row0][x + 3]; // top 3
+ const Pixel p5 = buffer[1][x - 1]; // left 0
+ const Pixel p6 = buffer[row2][x - 1]; // left 1
+ for (int i = 0; i < 8; ++i) {
+ const int xoffset = i & 0x03;
+ const int yoffset = (i >> 2) * ystep;
+ const int value = kFilterIntraTaps[pred][i][0] * p0 +
+ kFilterIntraTaps[pred][i][1] * p1 +
+ kFilterIntraTaps[pred][i][2] * p2 +
+ kFilterIntraTaps[pred][i][3] * p3 +
+ kFilterIntraTaps[pred][i][4] * p4 +
+ kFilterIntraTaps[pred][i][5] * p5 +
+ kFilterIntraTaps[pred][i][6] * p6;
+ // Section 7.11.2.3 specifies the right-hand side of the assignment as
+ // Clip1( Round2Signed( pr, INTRA_FILTER_SCALE_BITS ) ).
+ // Since Clip1() clips a negative value to 0, it is safe to replace
+ // Round2Signed() with Round2().
+ buffer[1 + yoffset][x + xoffset] = static_cast<Pixel>(
+ Clip3(RightShiftWithRounding(value, 4), 0, kMaxPixel));
+ }
+ x += 4;
+ } while (x < width);
+ memcpy(dst, &buffer[1][1], width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, &buffer[row2][1], width * sizeof(dst[0]));
+ dst += stride;
+
+ // The final row becomes the top for the next pass.
+ row0 ^= 2;
+ row2 ^= 2;
+ ystep = -ystep;
+ y += 2;
+ } while (y < height);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<8, uint8_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_FilterIntraPredictor
+ dsp->filter_intra_predictor = FilterIntraPredictor_C<10, uint16_t>;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace
+
+void IntraPredFilterInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
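As a usage sketch (not part of the patch), callers reach the predictor through
the dsp table rather than calling FilterIntraPredictor_C directly. This
assumes the public GetDspTable() accessor declared in src/dsp/dsp.h; real
callers initialize through the library's DspInit() rather than the per-module
*Init_C() functions, so that SIMD overrides are picked up:

    #include <cstddef>
    #include <cstdint>

    #include "src/dsp/dsp.h"

    // Predict a 4x4 block with the DC filter set through the table entry
    // installed above. |top_row| must have the top-left corner pixel at
    // index -1; buffer layout and sizes here are illustrative.
    void PredictFilterIntra4x4(uint8_t* dest, ptrdiff_t stride,
                               const uint8_t* top_row,
                               const uint8_t* left_column) {
      libgav1::dsp::IntraPredFilterInit_C();  // Not thread-safe; call once.
      const libgav1::dsp::Dsp* const dsp = libgav1::dsp::GetDspTable(8);
      dsp->filter_intra_predictor(dest, stride, top_row, left_column,
                                  libgav1::kFilterIntraDc,
                                  /*width=*/4, /*height=*/4);
    }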
diff --git a/libgav1/src/dsp/intrapred_filter.h b/libgav1/src/dsp/intrapred_filter.h
new file mode 100644
index 0000000..8146b82
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_filter.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_filter_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_filter_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor. This function is not thread-safe.
+void IntraPredFilterInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_FILTER_H_
diff --git a/libgav1/src/dsp/intrapred_smooth.cc b/libgav1/src/dsp/intrapred_smooth.cc
new file mode 100644
index 0000000..83c005e
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_smooth.cc
@@ -0,0 +1,738 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_smooth.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+template <int block_width, int block_height, typename Pixel>
+struct SmoothFuncs_C {
+ SmoothFuncs_C() = delete;
+
+ static void Smooth(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothVertical(void* dest, ptrdiff_t stride, const void* top_row,
+ const void* left_column);
+ static void SmoothHorizontal(void* dest, ptrdiff_t stride,
+ const void* top_row, const void* left_column);
+};
+
+constexpr uint8_t kSmoothWeights[] = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4};
+
+// SmoothFuncs_C::Smooth
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::Smooth(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(
+ block_width >= 4 && block_height >= 4,
+ "Weights for smooth predictor undefined for block width/height < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
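+ // Worked example at x == y == 0 (any block size): weights_x[0] ==
+ // weights_y[0] == 255, so pred == 255 * top[0] + 255 * left[0] +
+ // bottom_left + top_right and the output is RightShiftWithRounding(pred, 9),
+ // roughly the average of top[0] and left[0] with a 1/512 contribution from
+ // each corner pixel.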
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y] && scale_value >= weights_x[x]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ // The maximum value of pred with the rounder is 2^9 * (2^bitdepth - 1)
+ // + 256. With the descale there's no need for saturation.
+ dst[x] = static_cast<Pixel>(
+ RightShiftWithRounding(pred, kSmoothWeightScale + 1));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothVertical
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothVertical(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel bottom_left = left[block_height - 1];
+ static_assert(block_height >= 4,
+ "Weights for smooth predictor undefined for block height < 4");
+ const uint8_t* const weights_y = kSmoothWeights + block_height - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_y[y]);
+ uint32_t pred = weights_y[y] * top[x];
+ pred += static_cast<uint8_t>(scale_value - weights_y[y]) * bottom_left;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+// SmoothFuncs_C::SmoothHorizontal
+template <int block_width, int block_height, typename Pixel>
+void SmoothFuncs_C<block_width, block_height, Pixel>::SmoothHorizontal(
+ void* const dest, ptrdiff_t stride, const void* const top_row,
+ const void* const left_column) {
+ const auto* const top = static_cast<const Pixel*>(top_row);
+ const auto* const left = static_cast<const Pixel*>(left_column);
+ const Pixel top_right = top[block_width - 1];
+ static_assert(block_width >= 4,
+ "Weights for smooth predictor undefined for block width < 4");
+ const uint8_t* const weights_x = kSmoothWeights + block_width - 4;
+ const uint16_t scale_value = (1 << kSmoothWeightScale);
+ auto* dst = static_cast<Pixel*>(dest);
+ stride /= sizeof(Pixel);
+
+ for (int y = 0; y < block_height; ++y) {
+ for (int x = 0; x < block_width; ++x) {
+ assert(scale_value >= weights_x[x]);
+ uint32_t pred = weights_x[x] * left[y];
+ pred += static_cast<uint8_t>(scale_value - weights_x[x]) * top_right;
+ dst[x] =
+ static_cast<Pixel>(RightShiftWithRounding(pred, kSmoothWeightScale));
+ }
+ dst += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+template <typename Pixel>
+struct SmoothDefs {
+ SmoothDefs() = delete;
+
+ using _4x4 = SmoothFuncs_C<4, 4, Pixel>;
+ using _4x8 = SmoothFuncs_C<4, 8, Pixel>;
+ using _4x16 = SmoothFuncs_C<4, 16, Pixel>;
+ using _8x4 = SmoothFuncs_C<8, 4, Pixel>;
+ using _8x8 = SmoothFuncs_C<8, 8, Pixel>;
+ using _8x16 = SmoothFuncs_C<8, 16, Pixel>;
+ using _8x32 = SmoothFuncs_C<8, 32, Pixel>;
+ using _16x4 = SmoothFuncs_C<16, 4, Pixel>;
+ using _16x8 = SmoothFuncs_C<16, 8, Pixel>;
+ using _16x16 = SmoothFuncs_C<16, 16, Pixel>;
+ using _16x32 = SmoothFuncs_C<16, 32, Pixel>;
+ using _16x64 = SmoothFuncs_C<16, 64, Pixel>;
+ using _32x8 = SmoothFuncs_C<32, 8, Pixel>;
+ using _32x16 = SmoothFuncs_C<32, 16, Pixel>;
+ using _32x32 = SmoothFuncs_C<32, 32, Pixel>;
+ using _32x64 = SmoothFuncs_C<32, 64, Pixel>;
+ using _64x16 = SmoothFuncs_C<64, 16, Pixel>;
+ using _64x32 = SmoothFuncs_C<64, 32, Pixel>;
+ using _64x64 = SmoothFuncs_C<64, 64, Pixel>;
+};
+
+using Defs = SmoothDefs<uint8_t>;
+
+// Initializes dsp entries for kTransformSize|W|x|H| from |DEFS| of
+// the same size.
+#define INIT_SMOOTH_WxH(DEFS, W, H) \
+ dsp->intra_predictors[kTransformSize##W##x##H][kIntraPredictorSmooth] = \
+ DEFS::_##W##x##H::Smooth; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothVertical] = \
+ DEFS::_##W##x##H::SmoothVertical; \
+ dsp->intra_predictors[kTransformSize##W##x##H] \
+ [kIntraPredictorSmoothHorizontal] = \
+ DEFS::_##W##x##H::SmoothHorizontal
+
+#define INIT_SMOOTH(DEFS) \
+ INIT_SMOOTH_WxH(DEFS, 4, 4); \
+ INIT_SMOOTH_WxH(DEFS, 4, 8); \
+ INIT_SMOOTH_WxH(DEFS, 4, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 4); \
+ INIT_SMOOTH_WxH(DEFS, 8, 8); \
+ INIT_SMOOTH_WxH(DEFS, 8, 16); \
+ INIT_SMOOTH_WxH(DEFS, 8, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 4); \
+ INIT_SMOOTH_WxH(DEFS, 16, 8); \
+ INIT_SMOOTH_WxH(DEFS, 16, 16); \
+ INIT_SMOOTH_WxH(DEFS, 16, 32); \
+ INIT_SMOOTH_WxH(DEFS, 16, 64); \
+ INIT_SMOOTH_WxH(DEFS, 32, 8); \
+ INIT_SMOOTH_WxH(DEFS, 32, 16); \
+ INIT_SMOOTH_WxH(DEFS, 32, 32); \
+ INIT_SMOOTH_WxH(DEFS, 32, 64); \
+ INIT_SMOOTH_WxH(DEFS, 64, 16); \
+ INIT_SMOOTH_WxH(DEFS, 64, 32); \
+ INIT_SMOOTH_WxH(DEFS, 64, 64)
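+
+// For example, INIT_SMOOTH_WxH(Defs, 4, 4) expands to three assignments that
+// install Defs::_4x4::Smooth, Defs::_4x4::SmoothVertical and
+// Defs::_4x4::SmoothHorizontal into the kIntraPredictorSmooth* entries of
+// dsp->intra_predictors[kTransformSize4x4].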
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(Defs);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ Defs::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ Defs::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ Defs::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ Defs::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ Defs::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ Defs::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ Defs::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ Defs::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ Defs::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ Defs::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ Defs::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ Defs::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ Defs::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ Defs::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ Defs::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ Defs::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ Defs::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ Defs::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ Defs::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ Defs::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ Defs::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ Defs::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ Defs::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ Defs::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ Defs::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ Defs::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ Defs::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ Defs::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ Defs::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ Defs::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ Defs::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ Defs::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ Defs::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ Defs::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ Defs::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ Defs::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ Defs::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ Defs::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ Defs::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+using DefsHbd = SmoothDefs<uint16_t>;
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
+ assert(dsp != nullptr);
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ INIT_SMOOTH(DefsHbd);
+#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+ static_cast<void>(dsp);
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
+ DefsHbd::_4x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmooth] =
+ DefsHbd::_4x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmooth] =
+ DefsHbd::_4x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_4x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize4x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_4x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmooth] =
+ DefsHbd::_8x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmooth] =
+ DefsHbd::_8x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmooth] =
+ DefsHbd::_8x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmooth] =
+ DefsHbd::_8x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_8x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize8x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_8x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmooth] =
+ DefsHbd::_16x4::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x4::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x4][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x4::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmooth] =
+ DefsHbd::_16x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmooth] =
+ DefsHbd::_16x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmooth] =
+ DefsHbd::_16x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmooth] =
+ DefsHbd::_16x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_16x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize16x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_16x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmooth] =
+ DefsHbd::_32x8::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x8::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x8][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x8::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmooth] =
+ DefsHbd::_32x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmooth] =
+ DefsHbd::_32x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmooth] =
+ DefsHbd::_32x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_32x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize32x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_32x64::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmooth] =
+ DefsHbd::_64x16::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x16::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x16][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x16::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmooth] =
+ DefsHbd::_64x32::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x32::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x32][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x32::SmoothHorizontal;
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmooth
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmooth] =
+ DefsHbd::_64x64::Smooth;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothVertical
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothVertical] =
+ DefsHbd::_64x64::SmoothVertical;
+#endif
+#ifndef LIBGAV1_Dsp10bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+ dsp->intra_predictors[kTransformSize64x64][kIntraPredictorSmoothHorizontal] =
+ DefsHbd::_64x64::SmoothHorizontal;
+#endif
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+} // NOLINT(readability/fn_size)
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+#undef INIT_SMOOTH_WxH
+#undef INIT_SMOOTH
+} // namespace
+
+void IntraPredSmoothInit_C() {
+ Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
diff --git a/libgav1/src/dsp/intrapred_smooth.h b/libgav1/src/dsp/intrapred_smooth.h
new file mode 100644
index 0000000..6802003
--- /dev/null
+++ b/libgav1/src/dsp/intrapred_smooth.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+#define LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
+
+// Pull in LIBGAV1_DspXXX defines representing the implementation status
+// of each function. The resulting value of each can be used by each module to
+// determine whether an implementation is needed at compile time.
+// IWYU pragma: begin_exports
+
+// ARM:
+#include "src/dsp/arm/intrapred_smooth_neon.h"
+
+// x86:
+// Note: includes should be sorted in logical order (avx2/avx/sse4, etc.).
+// The order of includes is important as each tests for a superior version
+// before setting the base.
+// clang-format off
+#include "src/dsp/x86/intrapred_smooth_sse4.h"
+// clang-format on
+
+// IWYU pragma: end_exports
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_C();
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_SRC_DSP_INTRAPRED_SMOOTH_H_
diff --git a/libgav1/src/dsp/inverse_transform.cc b/libgav1/src/dsp/inverse_transform.cc
index 1c5a4a6..ed984d8 100644
--- a/libgav1/src/dsp/inverse_transform.cc
+++ b/libgav1/src/dsp/inverse_transform.cc
@@ -161,16 +161,13 @@
3, 35, 19, 51, 11, 43, 27, 59, 7, 39, 23, 55, 15, 47, 31, 63}};
template <typename Residual, int size_log2>
-void Dct_C(void* dest, const void* source, int8_t range) {
+void Dct_C(void* dest, int8_t range) {
static_assert(size_log2 >= 2 && size_log2 <= 6, "");
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// stage 1.
const int size = 1 << size_log2;
- // The copy is necessary because |dst| and |src| could be pointing to the same
- // buffer.
Residual temp[size];
- memcpy(temp, src, sizeof(temp));
+ memcpy(temp, dst, sizeof(temp));
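+ // In-place bit-reverse reordering of the input, staged through |temp|.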
for (int i = 0; i < size; ++i) {
dst[i] = temp[kBitReverseLookup[size_log2 - 2][i]];
}
@@ -266,7 +263,7 @@
if (size_log2 >= 3) {
for (int i = 0; i < 2; ++i) {
HadamardRotation_C(dst, MultiplyBy2(i) + 4, MultiplyBy2(i) + 5,
- static_cast<bool>(i), range);
+ /*flip=*/i != 0, range);
}
}
// stage 14.
@@ -308,7 +305,7 @@
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 2; ++j) {
HadamardRotation_C(dst, MultiplyBy4(i) + j + 8, MultiplyBy4(i) - j + 11,
- static_cast<bool>(i), range);
+ /*flip=*/i != 0, range);
}
}
}
@@ -396,12 +393,10 @@
}
template <int bitdepth, typename Residual, int size_log2>
-void DctDcOnly_C(void* dest, const void* source, int8_t range,
- bool should_round, int row_shift, bool is_row) {
+void DctDcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
- dst[0] = src[0];
if (is_row && should_round) {
dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
@@ -428,11 +423,9 @@
* Column transform max range in bits for bitdepths 8/10/12: 28/28/30.
*/
template <typename Residual>
-void Adst4_C(void* dest, const void* source, int8_t range) {
+void Adst4_C(void* dest, int8_t range) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
- if ((src[0] | src[1] | src[2] | src[3]) == 0) {
- memset(dst, 0, 4 * sizeof(dst[0]));
+ if ((dst[0] | dst[1] | dst[2] | dst[3]) == 0) {
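+ // The input is already in |dst|, so an all-zero input is left in place.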
return;
}
@@ -441,22 +434,22 @@
// values stored in the s and x arrays by this process are representable by
// a signed integer using range + 12 bits of precision.
int32_t s[7];
- s[0] = RangeCheckValue(kAdst4Multiplier[0] * src[0], range + 12);
- s[1] = RangeCheckValue(kAdst4Multiplier[1] * src[0], range + 12);
- s[2] = RangeCheckValue(kAdst4Multiplier[2] * src[1], range + 12);
- s[3] = RangeCheckValue(kAdst4Multiplier[3] * src[2], range + 12);
- s[4] = RangeCheckValue(kAdst4Multiplier[0] * src[2], range + 12);
- s[5] = RangeCheckValue(kAdst4Multiplier[1] * src[3], range + 12);
- s[6] = RangeCheckValue(kAdst4Multiplier[3] * src[3], range + 12);
+ s[0] = RangeCheckValue(kAdst4Multiplier[0] * dst[0], range + 12);
+ s[1] = RangeCheckValue(kAdst4Multiplier[1] * dst[0], range + 12);
+ s[2] = RangeCheckValue(kAdst4Multiplier[2] * dst[1], range + 12);
+ s[3] = RangeCheckValue(kAdst4Multiplier[3] * dst[2], range + 12);
+ s[4] = RangeCheckValue(kAdst4Multiplier[0] * dst[2], range + 12);
+ s[5] = RangeCheckValue(kAdst4Multiplier[1] * dst[3], range + 12);
+ s[6] = RangeCheckValue(kAdst4Multiplier[3] * dst[3], range + 12);
// stage 2.
// Section 7.13.2.6: It is a requirement of bitstream conformance that
// values stored in the variable a7 by this process are representable by a
// signed integer using range + 1 bits of precision.
- const int32_t a7 = RangeCheckValue(src[0] - src[2], range + 1);
+ const int32_t a7 = RangeCheckValue(dst[0] - dst[2], range + 1);
// Section 7.13.2.6: It is a requirement of bitstream conformance that
// values stored in the variable b7 by this process are representable by a
// signed integer using |range| bits of precision.
- const int32_t b7 = RangeCheckValue(a7 + src[3], range);
+ const int32_t b7 = RangeCheckValue(a7 + dst[3], range);
// stage 3.
s[0] = RangeCheckValue(s[0] + s[3], range + 12);
s[1] = RangeCheckValue(s[1] - s[4], range + 12);
@@ -490,14 +483,12 @@
}
template <int bitdepth, typename Residual>
-void Adst4DcOnly_C(void* dest, const void* source, int8_t range,
- bool should_round, int row_shift, bool is_row) {
+void Adst4DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
- dst[0] = src[0];
if (is_row && should_round) {
- dst[0] = RightShiftWithRounding(src[0] * kTransformRowMultiplier, 12);
+ dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
// stage 1.
@@ -570,12 +561,11 @@
}
template <typename Residual>
-void Adst8_C(void* dest, const void* source, int8_t range) {
+void Adst8_C(void* dest, int8_t range) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// stage 1.
int32_t temp[8];
- AdstInputPermutation(temp, src, 8);
+ AdstInputPermutation(temp, dst, 8);
// stage 2.
for (int i = 0; i < 4; ++i) {
ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 60 - 16 * i,
@@ -606,15 +596,14 @@
}
template <int bitdepth, typename Residual>
-void Adst8DcOnly_C(void* dest, const void* source, int8_t range,
- bool should_round, int row_shift, bool is_row) {
+void Adst8DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// stage 1.
int32_t temp[8];
+ // After the permutation, the dc value is in temp[1]. The remaining entries
+ // are zero.
- AdstInputPermutation(temp, src, 8);
+ AdstInputPermutation(temp, dst, 8);
if (is_row && should_round) {
temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
@@ -654,12 +643,11 @@
}
template <typename Residual>
-void Adst16_C(void* dest, const void* source, int8_t range) {
+void Adst16_C(void* dest, int8_t range) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// stage 1.
int32_t temp[16];
- AdstInputPermutation(temp, src, 16);
+ AdstInputPermutation(temp, dst, 16);
// stage 2.
for (int i = 0; i < 8; ++i) {
ButterflyRotation_C(temp, MultiplyBy2(i), MultiplyBy2(i) + 1, 62 - 8 * i,
@@ -707,15 +695,14 @@
}
template <int bitdepth, typename Residual>
-void Adst16DcOnly_C(void* dest, const void* source, int8_t range,
- bool should_round, int row_shift, bool is_row) {
+void Adst16DcOnly_C(void* dest, int8_t range, bool should_round, int row_shift,
+ bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// stage 1.
int32_t temp[16];
+ // After the permutation, the dc value is in temp[1]. The remaining entries
+ // are zero.
- AdstInputPermutation(temp, src, 16);
+ AdstInputPermutation(temp, dst, 16);
if (is_row && should_round) {
temp[1] = RightShiftWithRounding(temp[1] * kTransformRowMultiplier, 12);
@@ -798,7 +785,7 @@
// optimized.
//
// The identity transform functions have the following prototype:
-// void Identity_C(void* dest, const void* source, int8_t shift);
+// void Identity_C(void* dest, int8_t shift);
//
// The |shift| parameter is the amount of shift for the Round2() call. For row
// transforms, |shift| is 0, 1, or 2. For column transforms, |shift| is always
@@ -852,10 +839,9 @@
// 4 (2 bits) and |shift| is always 4.
template <typename Residual>
-void Identity4Row_C(void* dest, const void* source, int8_t shift) {
+void Identity4Row_C(void* dest, int8_t shift) {
assert(shift == 0 || shift == 1);
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
// If |shift| is 0, |rounding| should be 1 << 11. If |shift| is 1, |rounding|
// should be (1 + (1 << 1)) << 11. The following expression works for both
// values of |shift|.
@@ -864,7 +850,7 @@
// The intermediate value here will have to fit into an int32_t for it to be
// bitstream conformant. The multiplication is promoted to int32_t by
// defining kIdentity4Multiplier as int32_t.
- int32_t dst_i = (src[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
+ int32_t dst_i = (dst[i] * kIdentity4Multiplier + rounding) >> (12 + shift);
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -873,27 +859,24 @@
}
template <typename Residual>
-void Identity4Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity4Column_C(void* dest, int8_t /*shift*/) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
for (int i = 0; i < 4; ++i) {
// The intermediate value here will have to fit into an int32_t for it to be
// bitstream conformant. The multiplication is promoted to int32_t by
// defining kIdentity4Multiplier as int32_t.
- dst[i] = static_cast<Residual>((src[i] * kIdentity4Multiplier + rounding) >>
+ dst[i] = static_cast<Residual>((dst[i] * kIdentity4Multiplier + rounding) >>
(12 + kTransformColumnShift));
}
}
template <int bitdepth, typename Residual>
-void Identity4DcOnly_C(void* dest, const void* source, int8_t /*range*/,
- bool should_round, int row_shift, bool is_row) {
+void Identity4DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
if (is_row) {
- dst[0] = src[0];
if (should_round) {
dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
@@ -911,17 +894,16 @@
}
const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
- dst[0] = static_cast<Residual>((src[0] * kIdentity4Multiplier + rounding) >>
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity4Multiplier + rounding) >>
(12 + kTransformColumnShift));
}
template <typename Residual>
-void Identity8Row_C(void* dest, const void* source, int8_t shift) {
+void Identity8Row_C(void* dest, int8_t shift) {
assert(shift == 0 || shift == 1 || shift == 2);
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
for (int i = 0; i < 8; ++i) {
- int32_t dst_i = RightShiftWithRounding(MultiplyBy2(src[i]), shift);
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy2(dst[i]), shift);
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -930,23 +912,20 @@
}
template <typename Residual>
-void Identity8Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity8Column_C(void* dest, int8_t /*shift*/) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
for (int i = 0; i < 8; ++i) {
dst[i] = static_cast<Residual>(
- RightShiftWithRounding(src[i], kTransformColumnShift - 1));
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 1));
}
}
template <int bitdepth, typename Residual>
-void Identity8DcOnly_C(void* dest, const void* source, int8_t /*range*/,
- bool should_round, int row_shift, bool is_row) {
+void Identity8DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
if (is_row) {
- dst[0] = src[0];
if (should_round) {
dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
@@ -969,20 +948,19 @@
}
dst[0] = static_cast<Residual>(
- RightShiftWithRounding(src[0], kTransformColumnShift - 1));
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 1));
}
template <typename Residual>
-void Identity16Row_C(void* dest, const void* source, int8_t shift) {
+void Identity16Row_C(void* dest, int8_t shift) {
assert(shift == 1 || shift == 2);
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
const int32_t rounding = (1 + (1 << shift)) << 11;
for (int i = 0; i < 16; ++i) {
// The intermediate value here will have to fit into an int32_t for it to be
// bitstream conformant. The multiplication is promoted to int32_t by
// defining kIdentity16Multiplier as int32_t.
- int32_t dst_i = (src[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
+ int32_t dst_i = (dst[i] * kIdentity16Multiplier + rounding) >> (12 + shift);
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -991,28 +969,25 @@
}
template <typename Residual>
-void Identity16Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity16Column_C(void* dest, int8_t /*shift*/) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
for (int i = 0; i < 16; ++i) {
// The intermediate value here will have to fit into an int32_t for it to be
// bitstream conformant. The multiplication is promoted to int32_t by
// defining kIdentity16Multiplier as int32_t.
dst[i] =
- static_cast<Residual>((src[i] * kIdentity16Multiplier + rounding) >>
+ static_cast<Residual>((dst[i] * kIdentity16Multiplier + rounding) >>
(12 + kTransformColumnShift));
}
}
template <int bitdepth, typename Residual>
-void Identity16DcOnly_C(void* dest, const void* source, int8_t /*range*/,
- bool should_round, int row_shift, bool is_row) {
+void Identity16DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
if (is_row) {
- dst[0] = src[0];
if (should_round) {
dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
@@ -1030,17 +1005,16 @@
}
const int32_t rounding = (1 + (1 << kTransformColumnShift)) << 11;
- dst[0] = static_cast<Residual>((src[0] * kIdentity16Multiplier + rounding) >>
+ dst[0] = static_cast<Residual>((dst[0] * kIdentity16Multiplier + rounding) >>
(12 + kTransformColumnShift));
}
template <typename Residual>
-void Identity32Row_C(void* dest, const void* source, int8_t shift) {
+void Identity32Row_C(void* dest, int8_t shift) {
assert(shift == 1 || shift == 2);
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
for (int i = 0; i < 32; ++i) {
- int32_t dst_i = RightShiftWithRounding(MultiplyBy4(src[i]), shift);
+ int32_t dst_i = RightShiftWithRounding(MultiplyBy4(dst[i]), shift);
if (sizeof(Residual) == 2) {
dst_i = Clip3(dst_i, INT16_MIN, INT16_MAX);
}
@@ -1049,23 +1023,20 @@
}
template <typename Residual>
-void Identity32Column_C(void* dest, const void* source, int8_t /*shift*/) {
+void Identity32Column_C(void* dest, int8_t /*shift*/) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
for (int i = 0; i < 32; ++i) {
dst[i] = static_cast<Residual>(
- RightShiftWithRounding(src[i], kTransformColumnShift - 2));
+ RightShiftWithRounding(dst[i], kTransformColumnShift - 2));
}
}
template <int bitdepth, typename Residual>
-void Identity32DcOnly_C(void* dest, const void* source, int8_t /*range*/,
- bool should_round, int row_shift, bool is_row) {
+void Identity32DcOnly_C(void* dest, int8_t /*range*/, bool should_round,
+ int row_shift, bool is_row) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
if (is_row) {
- dst[0] = src[0];
if (should_round) {
dst[0] = RightShiftWithRounding(dst[0] * kTransformRowMultiplier, 12);
}
@@ -1081,21 +1052,20 @@
}
dst[0] = static_cast<Residual>(
- RightShiftWithRounding(src[0], kTransformColumnShift - 2));
+ RightShiftWithRounding(dst[0], kTransformColumnShift - 2));
}
//------------------------------------------------------------------------------
// Walsh Hadamard Transform.
template <typename Residual>
-void Wht4_C(void* dest, const void* source, int8_t shift) {
+void Wht4_C(void* dest, int8_t shift) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
Residual temp[4];
- temp[0] = src[0] >> shift;
- temp[2] = src[1] >> shift;
- temp[3] = src[2] >> shift;
- temp[1] = src[3] >> shift;
+ temp[0] = dst[0] >> shift;
+ temp[2] = dst[1] >> shift;
+ temp[3] = dst[2] >> shift;
+ temp[1] = dst[3] >> shift;
temp[0] += temp[2];
temp[3] -= temp[1];
// This signed right shift must be an arithmetic shift.
@@ -1107,13 +1077,12 @@
}
template <int bitdepth, typename Residual>
-void Wht4DcOnly_C(void* dest, const void* source, int8_t range,
- bool /*should_round*/, int /*row_shift*/, bool /*is_row*/) {
+void Wht4DcOnly_C(void* dest, int8_t range, bool /*should_round*/,
+ int /*row_shift*/, bool /*is_row*/) {
auto* const dst = static_cast<Residual*>(dest);
- const auto* const src = static_cast<const Residual*>(source);
const int shift = range;
- Residual temp = src[0] >> shift;
+ Residual temp = dst[0] >> shift;
// This signed right shift must be an arithmetic shift.
Residual e = temp >> 1;
dst[0] = temp - e;
@@ -1127,20 +1096,18 @@
//------------------------------------------------------------------------------
// row/column transform loop
-using InverseTransform1DFunc = void (*)(void* dst, const void* src,
- int8_t range);
-using InverseTransformDcOnlyFunc = void (*)(void* dest, const void* source,
- int8_t range, bool should_round,
- int row_shift, bool is_row);
+using InverseTransform1DFunc = void (*)(void* dst, int8_t range);
+using InverseTransformDcOnlyFunc = void (*)(void* dest, int8_t range,
+ bool should_round, int row_shift,
+ bool is_row);
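+// Both function types operate in place: |dst| holds the input coefficients on
+// entry and the transformed values on return.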
template <int bitdepth, typename Residual, typename Pixel,
Transform1D transform1d_type,
InverseTransformDcOnlyFunc dconly_transform1d,
- InverseTransform1DFunc row_transform1d_func,
- InverseTransform1DFunc column_transform1d_func = row_transform1d_func>
+ InverseTransform1DFunc transform1d_func, bool is_row>
void TransformLoop_C(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row, int non_zero_coeff_count) {
+ int adjusted_tx_height, void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
constexpr bool lossless = transform1d_type == k1DTransformWht;
constexpr bool is_identity = transform1d_type == k1DTransformIdentity;
// The transform size of the WHT is always 4x4. Setting tx_width and
@@ -1168,19 +1135,16 @@
// the fraction 2896 / 2^12.
const bool should_round = std::abs(tx_width_log2 - tx_height_log2) == 1;
- if (non_zero_coeff_count == 1) {
- dconly_transform1d(residual[0], residual[0], row_clamp_range,
- should_round, row_shift, true);
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(residual[0], row_clamp_range, should_round, row_shift,
+ true);
return;
}
// Row transforms need to be done only up to 32 because the rest of the rows
// are always all zero if |tx_height| is 64. Otherwise, only process the
// rows that have non-zero coefficients.
- // TODO(slavarnway): Expand to include other possible non_zero_coeff_count
- // values.
- const int num_rows = std::min(tx_height, 32);
- for (int i = 0; i < num_rows; ++i) {
+ for (int i = 0; i < adjusted_tx_height; ++i) {
// If lossless, the transform size is 4x4, so should_round is false.
if (!lossless && should_round) {
// The last 32 values of every row are always zero if the |tx_width| is
@@ -1190,10 +1154,9 @@
residual[i][j] * kTransformRowMultiplier, 12);
}
}
- // For identity transform, |row_transform1d_func| also performs the
+ // For identity transform, |transform1d_func| also performs the
// Round2(T[j], rowShift) call in the spec.
- row_transform1d_func(residual[i], residual[i],
- is_identity ? row_shift : row_clamp_range);
+ transform1d_func(residual[i], is_identity ? row_shift : row_clamp_range);
if (!lossless && !is_identity && row_shift > 0) {
for (int j = 0; j < tx_width; ++j) {
residual[i][j] = RightShiftWithRounding(residual[i][j], row_shift);
@@ -1221,17 +1184,17 @@
Residual tx_buffer[64];
for (int j = 0; j < tx_width; ++j) {
const int flipped_j = flip_columns ? tx_width - j - 1 : j;
- for (int i = 0; i < tx_height; ++i) {
+ int i = 0;
+ do {
tx_buffer[i] = residual[i][flipped_j];
- }
- if (non_zero_coeff_count == 1) {
- dconly_transform1d(tx_buffer, tx_buffer, column_clamp_range, false, 0,
- false);
+ } while (++i != tx_height);
+ if (adjusted_tx_height == 1) {
+ dconly_transform1d(tx_buffer, column_clamp_range, false, 0, false);
} else {
- // For identity transform, |column_transform1d_func| also performs the
+ // For identity transform, |transform1d_func| also performs the
// Round2(T[i], colShift) call in the spec.
- column_transform1d_func(tx_buffer, tx_buffer,
- is_identity ? column_shift : column_clamp_range);
+ transform1d_func(tx_buffer,
+ is_identity ? column_shift : column_clamp_range);
}
const int x = start_x + j;
for (int i = 0; i < tx_height; ++i) {
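// Note (not in the diff): the for -> do/while rewrite of the copy loop above
// encodes the invariant that tx_height is at least 4, so the body always
// runs and the compiler can drop the entry test a for-loop would emit; the
// same do/while pattern shows up in the hot loops elsewhere in this patch.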
@@ -1249,139 +1212,264 @@
//------------------------------------------------------------------------------
+#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
template <int bitdepth, typename Residual, typename Pixel>
void InitAll(Dsp* const dsp) {
// Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
- DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>>;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
- DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>>;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+ DctDcOnly_C<bitdepth, Residual, 2>, Dct_C<Residual, 2>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
- DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>>;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
- DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>>;
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+ DctDcOnly_C<bitdepth, Residual, 3>, Dct_C<Residual, 3>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
- DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>>;
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 4>, Dct_C<Residual, 4>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 5>, Dct_C<Residual, 5>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformDct,
+ DctDcOnly_C<bitdepth, Residual, 6>, Dct_C<Residual, 6>,
+ /*is_row=*/false>;
// Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
- Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>>;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
- Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>>;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+ Adst4DcOnly_C<bitdepth, Residual>, Adst4_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
- Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>>;
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst8DcOnly_C<bitdepth, Residual>, Adst8_C<Residual>,
+ /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformAdst,
+ Adst16DcOnly_C<bitdepth, Residual>, Adst16_C<Residual>,
+ /*is_row=*/false>;
// Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
Identity4DcOnly_C<bitdepth, Residual>,
- Identity4Row_C<Residual>, Identity4Column_C<Residual>>;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+ Identity4Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity4DcOnly_C<bitdepth, Residual>,
+ Identity4Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
Identity8DcOnly_C<bitdepth, Residual>,
- Identity8Row_C<Residual>, Identity8Column_C<Residual>>;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+ Identity8Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity8DcOnly_C<bitdepth, Residual>,
+ Identity8Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
Identity16DcOnly_C<bitdepth, Residual>,
- Identity16Row_C<Residual>, Identity16Column_C<Residual>>;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+ Identity16Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity16DcOnly_C<bitdepth, Residual>,
+ Identity16Column_C<Residual>, /*is_row=*/false>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
Identity32DcOnly_C<bitdepth, Residual>,
- Identity32Row_C<Residual>, Identity32Column_C<Residual>>;
+ Identity32Row_C<Residual>, /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformIdentity,
+ Identity32DcOnly_C<bitdepth, Residual>,
+ Identity32Column_C<Residual>, /*is_row=*/false>;
// Maximum transform size for Wht is 4.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
- Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>>;
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<bitdepth, Residual, Pixel, k1DTransformWht,
+ Wht4DcOnly_C<bitdepth, Residual>, Wht4_C<Residual>,
+ /*is_row=*/false>;
}
+#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
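// Sketch (not in the diff; parameter order taken from TransformLoop_C
// above): the dsp table gains a third axis, [transform][size][kRow/kColumn],
// so a 2-D inverse transform becomes two independent lookups that SIMD ports
// can override separately:
//   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow](
//       tx_type, tx_size, adjusted_tx_height, residual, start_x, start_y,
//       frame);
//   dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn](
//       tx_type, tx_size, adjusted_tx_height, residual, start_x, start_y,
//       frame);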
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
for (auto& inverse_transform : inverse_transform_by_size) {
- inverse_transform = nullptr;
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
}
}
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
InitAll<8, int16_t, uint8_t>(dsp);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
- DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>>;
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 2>, Dct_C<int16_t, 2>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
- DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>>;
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 3>, Dct_C<int16_t, 3>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
- DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>>;
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 4>, Dct_C<int16_t, 4>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
- DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>>;
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 5>, Dct_C<int16_t, 5>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize64_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
- DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>>;
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformDct,
+ DctDcOnly_C<8, int16_t, 6>, Dct_C<int16_t, 6>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
- Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>>;
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst4DcOnly_C<8, int16_t>, Adst4_C<int16_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
- Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>>;
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst8DcOnly_C<8, int16_t>, Adst8_C<int16_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
- Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>>;
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformAdst,
+ Adst16DcOnly_C<8, int16_t>, Adst16_C<int16_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
Identity4DcOnly_C<8, int16_t>, Identity4Row_C<int16_t>,
- Identity4Column_C<int16_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity4DcOnly_C<8, int16_t>, Identity4Column_C<int16_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize8_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
Identity8DcOnly_C<8, int16_t>, Identity8Row_C<int16_t>,
- Identity8Column_C<int16_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity8DcOnly_C<8, int16_t>, Identity8Column_C<int16_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize16_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
Identity16DcOnly_C<8, int16_t>, Identity16Row_C<int16_t>,
- Identity16Column_C<int16_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity16DcOnly_C<8, int16_t>,
+ Identity16Column_C<int16_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize32_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
Identity32DcOnly_C<8, int16_t>, Identity32Row_C<int16_t>,
- Identity32Column_C<int16_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformIdentity,
+ Identity32DcOnly_C<8, int16_t>,
+ Identity32Column_C<int16_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
- Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>>;
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<8, int16_t, uint8_t, k1DTransformWht,
+ Wht4DcOnly_C<8, int16_t>, Wht4_C<int16_t>,
+ /*is_row=*/false>;
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
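// Note (not in the diff): each #ifndef installs the C fallback only for a
// slot no SIMD port has claimed. A port claims a slot by defining the
// matching macro in its header before this file is compiled, typically
// along the lines of:
//   #define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1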
@@ -1392,80 +1480,142 @@
assert(dsp != nullptr);
for (auto& inverse_transform_by_size : dsp->inverse_transforms) {
for (auto& inverse_transform : inverse_transform_by_size) {
- inverse_transform = nullptr;
+ inverse_transform[kRow] = nullptr;
+ inverse_transform[kColumn] = nullptr;
}
}
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
InitAll<10, int32_t, uint16_t>(dsp);
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
- DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>>;
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 2>, Dct_C<int32_t, 2>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
- DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>>;
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 3>, Dct_C<int32_t, 3>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
- DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>>;
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 4>, Dct_C<int32_t, 4>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
- DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>>;
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 5>, Dct_C<int32_t, 5>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize64_1DTransformDct
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
- DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>>;
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformDct,
+ DctDcOnly_C<10, int32_t, 6>, Dct_C<int32_t, 6>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
- Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>>;
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst4DcOnly_C<10, int32_t>, Adst4_C<int32_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
- Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>>;
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst8DcOnly_C<10, int32_t>, Adst8_C<int32_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformAdst
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
- Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>>;
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformAdst,
+ Adst16DcOnly_C<10, int32_t>, Adst16_C<int32_t>,
+ /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
Identity4DcOnly_C<10, int32_t>, Identity4Row_C<int32_t>,
- Identity4Column_C<int32_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity4DcOnly_C<10, int32_t>,
+ Identity4Column_C<int32_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize8_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
Identity8DcOnly_C<10, int32_t>, Identity8Row_C<int32_t>,
- Identity8Column_C<int32_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity8DcOnly_C<10, int32_t>,
+ Identity8Column_C<int32_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize16_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
Identity16DcOnly_C<10, int32_t>, Identity16Row_C<int32_t>,
- Identity16Column_C<int32_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity16DcOnly_C<10, int32_t>,
+ Identity16Column_C<int32_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize32_1DTransformIdentity
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
Identity32DcOnly_C<10, int32_t>, Identity32Row_C<int32_t>,
- Identity32Column_C<int32_t>>;
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformIdentity,
+ Identity32DcOnly_C<10, int32_t>,
+ Identity32Column_C<int32_t>, /*is_row=*/false>;
#endif
#ifndef LIBGAV1_Dsp10bpp_1DTransformSize4_1DTransformWht
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
- Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>>;
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/true>;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ TransformLoop_C<10, int32_t, uint16_t, k1DTransformWht,
+ Wht4DcOnly_C<10, int32_t>, Wht4_C<int32_t>,
+ /*is_row=*/false>;
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
diff --git a/libgav1/src/dsp/inverse_transform.inc b/libgav1/src/dsp/inverse_transform.inc
index 1893884..55e68b6 100644
--- a/libgav1/src/dsp/inverse_transform.inc
+++ b/libgav1/src/dsp/inverse_transform.inc
@@ -46,84 +46,6 @@
inline int16_t Sin128(int angle) { return Cos128(angle - 64); }
-template <int tx_width>
-LIBGAV1_ALWAYS_INLINE int GetNumRows(TransformType tx_type, int tx_height,
- int non_zero_coeff_count) {
- const TransformClass tx_class = GetTransformClass(tx_type);
- // The transform loops process either 4 or a multiple of 8 rows. Use tx_class
- // to determine the scan order. Then return the number of rows based on the
- // non_zero_coeff_count.
- if (tx_height > 4) {
- if (tx_class == kTransformClass2D) {
- if (tx_width == 4) {
- if (non_zero_coeff_count <= 10) return 4;
- if (non_zero_coeff_count <= 29) return 8;
- return tx_height;
- }
- if (tx_width == 8) {
- if (non_zero_coeff_count <= 10) return 4;
- if (non_zero_coeff_count <= 43) return 8;
- if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
- return tx_height;
- }
- if (tx_width == 16) {
- if (non_zero_coeff_count <= 10) return 4;
- if (non_zero_coeff_count <= 36) return 8;
- if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
- return tx_height;
- }
- if (tx_width == 32) {
- if (non_zero_coeff_count <= 10) return 4;
- if (non_zero_coeff_count <= 36) return 8;
- if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
- return tx_height;
- }
- }
-
- if (tx_class == kTransformClassHorizontal) {
- if (non_zero_coeff_count <= 4) return 4;
- if (non_zero_coeff_count <= 8) return 8;
- if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
- return tx_height;
- }
-
- if (tx_class == kTransformClassVertical) {
- if (tx_width == 4) {
- if (non_zero_coeff_count <= 16) return 4;
- if (non_zero_coeff_count <= 32) return 8;
- return tx_height;
- }
- if (tx_width == 8) {
- if (non_zero_coeff_count <= 32) return 4;
- if (non_zero_coeff_count <= 64) return 8;
- if ((non_zero_coeff_count <= 128) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 192) & (tx_height > 16)) return 24;
- return tx_height;
- }
-
- if (tx_width == 16) {
- if (non_zero_coeff_count <= 64) return 4;
- if (non_zero_coeff_count <= 128) return 8;
- if ((non_zero_coeff_count <= 256) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 384) & (tx_height > 16)) return 24;
- return tx_height;
- }
- if (tx_width == 32) {
- if (non_zero_coeff_count <= 128) return 4;
- if (non_zero_coeff_count <= 256) return 8;
- if ((non_zero_coeff_count <= 512) & (tx_height > 16)) return 16;
- if ((non_zero_coeff_count <= 768) & (tx_height > 16)) return 24;
- return tx_height;
- }
- }
- }
- return tx_height;
-}
-
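// Note (not in the diff): these scan-class cutoffs are not simply dropped;
// the row count they produced is now derived once on the caller side and
// threaded into the transform loops as |adjusted_tx_height| (see
// TransformLoop_C above), so the C and SIMD paths no longer each re-derive
// it from |non_zero_coeff_count|.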
// The value for index i is derived as:
// round(sqrt(2) * sin(i * pi / 9) * 2 / 3 * (1 << 12)).
constexpr int16_t kAdst4Multiplier[4] = {1321, 2482, 3344, 3803};
diff --git a/libgav1/src/dsp/libgav1_dsp.cmake b/libgav1/src/dsp/libgav1_dsp.cmake
index 00574fa..a28334d 100644
--- a/libgav1/src/dsp/libgav1_dsp.cmake
+++ b/libgav1/src/dsp/libgav1_dsp.cmake
@@ -30,6 +30,7 @@
"${libgav1_source}/dsp/constants.h"
"${libgav1_source}/dsp/convolve.cc"
"${libgav1_source}/dsp/convolve.h"
+ "${libgav1_source}/dsp/convolve.inc"
"${libgav1_source}/dsp/distance_weighted_blend.cc"
"${libgav1_source}/dsp/distance_weighted_blend.h"
"${libgav1_source}/dsp/dsp.cc"
@@ -39,8 +40,16 @@
"${libgav1_source}/dsp/film_grain_common.h"
"${libgav1_source}/dsp/intra_edge.cc"
"${libgav1_source}/dsp/intra_edge.h"
+ "${libgav1_source}/dsp/intrapred_cfl.cc"
+ "${libgav1_source}/dsp/intrapred_cfl.h"
+ "${libgav1_source}/dsp/intrapred_directional.cc"
+ "${libgav1_source}/dsp/intrapred_directional.h"
+ "${libgav1_source}/dsp/intrapred_filter.cc"
+ "${libgav1_source}/dsp/intrapred_filter.h"
"${libgav1_source}/dsp/intrapred.cc"
"${libgav1_source}/dsp/intrapred.h"
+ "${libgav1_source}/dsp/intrapred_smooth.cc"
+ "${libgav1_source}/dsp/intrapred_smooth.h"
"${libgav1_source}/dsp/inverse_transform.cc"
"${libgav1_source}/dsp/inverse_transform.h"
"${libgav1_source}/dsp/inverse_transform.inc"
@@ -64,6 +73,16 @@
"${libgav1_source}/dsp/weight_mask.cc"
"${libgav1_source}/dsp/weight_mask.h")
+list(APPEND libgav1_dsp_sources_avx2
+ ${libgav1_dsp_sources_avx2}
+ "${libgav1_source}/dsp/x86/cdef_avx2.cc"
+ "${libgav1_source}/dsp/x86/cdef_avx2.h"
+ "${libgav1_source}/dsp/x86/convolve_avx2.cc"
+ "${libgav1_source}/dsp/x86/convolve_avx2.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.cc"
+ "${libgav1_source}/dsp/x86/loop_restoration_avx2.h")
+
list(APPEND libgav1_dsp_sources_neon
${libgav1_dsp_sources_neon}
"${libgav1_source}/dsp/arm/average_blend_neon.cc"
@@ -80,11 +99,16 @@
"${libgav1_source}/dsp/arm/intra_edge_neon.cc"
"${libgav1_source}/dsp/arm/intra_edge_neon.h"
"${libgav1_source}/dsp/arm/intrapred_cfl_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_cfl_neon.h"
+ "${libgav1_source}/dsp/arm/intrapred_directional_neon.h"
"${libgav1_source}/dsp/arm/intrapred_directional_neon.cc"
- "${libgav1_source}/dsp/arm/intrapred_filter_intra_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_filter_neon.h"
"${libgav1_source}/dsp/arm/intrapred_neon.cc"
"${libgav1_source}/dsp/arm/intrapred_neon.h"
"${libgav1_source}/dsp/arm/intrapred_smooth_neon.cc"
+ "${libgav1_source}/dsp/arm/intrapred_smooth_neon.h"
+ "${libgav1_source}/dsp/arm/inverse_transform_10bit_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.cc"
"${libgav1_source}/dsp/arm/inverse_transform_neon.h"
"${libgav1_source}/dsp/arm/loop_filter_neon.cc"
@@ -115,18 +139,28 @@
"${libgav1_source}/dsp/x86/cdef_sse4.h"
"${libgav1_source}/dsp/x86/convolve_sse4.cc"
"${libgav1_source}/dsp/x86/convolve_sse4.h"
+ "${libgav1_source}/dsp/x86/convolve_sse4.inc"
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.cc"
"${libgav1_source}/dsp/x86/distance_weighted_blend_sse4.h"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.cc"
+ "${libgav1_source}/dsp/x86/film_grain_sse4.h"
"${libgav1_source}/dsp/x86/intra_edge_sse4.cc"
"${libgav1_source}/dsp/x86/intra_edge_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_directional_sse4.h"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_filter_sse4.h"
"${libgav1_source}/dsp/x86/intrapred_sse4.cc"
"${libgav1_source}/dsp/x86/intrapred_sse4.h"
- "${libgav1_source}/dsp/x86/intrapred_cfl_sse4.cc"
"${libgav1_source}/dsp/x86/intrapred_smooth_sse4.cc"
+ "${libgav1_source}/dsp/x86/intrapred_smooth_sse4.h"
"${libgav1_source}/dsp/x86/inverse_transform_sse4.cc"
"${libgav1_source}/dsp/x86/inverse_transform_sse4.h"
"${libgav1_source}/dsp/x86/loop_filter_sse4.cc"
"${libgav1_source}/dsp/x86/loop_filter_sse4.h"
+ "${libgav1_source}/dsp/x86/loop_restoration_10bit_sse4.cc"
"${libgav1_source}/dsp/x86/loop_restoration_sse4.cc"
"${libgav1_source}/dsp/x86/loop_restoration_sse4.h"
"${libgav1_source}/dsp/x86/mask_blend_sse4.cc"
@@ -143,12 +177,13 @@
"${libgav1_source}/dsp/x86/warp_sse4.cc"
"${libgav1_source}/dsp/x86/warp_sse4.h"
"${libgav1_source}/dsp/x86/weight_mask_sse4.cc"
- "${libgav1_source}/dsp/x86/weight_mask_sse4.h"
- )
+ "${libgav1_source}/dsp/x86/weight_mask_sse4.h")
macro(libgav1_add_dsp_targets)
unset(dsp_sources)
- list(APPEND dsp_sources ${libgav1_dsp_sources} ${libgav1_dsp_sources_neon}
+ list(APPEND dsp_sources ${libgav1_dsp_sources}
+ ${libgav1_dsp_sources_neon}
+ ${libgav1_dsp_sources_avx2}
${libgav1_dsp_sources_sse4})
libgav1_add_library(NAME
diff --git a/libgav1/src/dsp/loop_restoration.cc b/libgav1/src/dsp/loop_restoration.cc
index fce54f2..1a15d90 100644
--- a/libgav1/src/dsp/loop_restoration.cc
+++ b/libgav1/src/dsp/loop_restoration.cc
@@ -18,6 +18,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
+#include <cstring>
#include "src/dsp/common.h"
#include "src/dsp/dsp.h"
@@ -36,7 +37,7 @@
// else
// a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
// ma = 256 - a2;
-const uint8_t kSgrMaLookup[256] = {
+alignas(16) const uint8_t kSgrMaLookup[256] = {
255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7,
7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5,
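// Spot check (not in the diff; assumes kSgrProjSgrBits == 8): the formula
// above reproduces the leading entries, and the new alignas(16) lets SIMD
// ports load the table with aligned vector reads:
//   z == 1: a2 = (256 + 0) / 2 == 128 -> ma = 128
//   z == 2: a2 = (512 + 1) / 3 == 171 -> ma = 85
//   z == 3: a2 = (768 + 1) / 4 == 192 -> ma = 64
// (z == 0 hits the special case above the quoted lines, giving ma = 255.)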
@@ -68,8 +69,7 @@
constexpr int offset =
1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
constexpr int limit = (offset << 2) - 1;
- int y = height;
- do {
+ for (int y = 0; y < height; ++y) {
int x = 0;
do {
// sum fits into 16 bits only when bitdepth = 8.
@@ -84,7 +84,7 @@
} while (++x != width);
source += source_stride;
*wiener_buffer += width;
- } while (--y != 0);
+ }
}
template <int bitdepth, typename Pixel>
@@ -143,10 +143,12 @@
// filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
// Thus in libaom's computation, an offset of 128 is needed for filter[3].
template <int bitdepth, typename Pixel>
-void WienerFilter_C(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- ptrdiff_t source_stride, ptrdiff_t dest_stride, int width,
- int height, RestorationBuffer* const restoration_buffer) {
+void WienerFilter_C(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
constexpr int kCenterTap = kWienerFilterTaps / 2;
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
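// Note (not in the diff): the new signature hands the already-padded top
// and bottom border rows in explicitly instead of letting the filter read
// above and below |source|. On the coefficient side the seven taps are
// symmetric and sum to 1 << kWienerFilterBits == 128, so the libaom
// convention quoted above recovers the center tap as:
//   filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]);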
@@ -158,28 +160,51 @@
// horizontal filtering.
const int height_horizontal =
height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
const int16_t* const filter_horizontal =
restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
- const auto* src = static_cast<const Pixel*>(source);
- src -= (kCenterTap - number_rows_to_skip) * source_stride + kCenterTap;
+ const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
+ const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
+ const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
- height_horizontal, filter_horizontal, 0,
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 0, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 0,
&wiener_buffer);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
- height_horizontal, filter_horizontal, 1,
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 1, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 1,
&wiener_buffer);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
- WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
- height_horizontal, filter_horizontal, 2,
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 2, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 2,
&wiener_buffer);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontal<bitdepth, Pixel>(src, source_stride, width,
- height_horizontal, filter_horizontal, 3,
+ WienerHorizontal<bitdepth, Pixel>(
+ top + (2 - height_extra) * top_border_stride, top_border_stride, width,
+ height_extra, filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
+ filter_horizontal, 3, &wiener_buffer);
+ WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
+ height_extra, filter_horizontal, 3,
&wiener_buffer);
}
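// Note (not in the diff): number_leading_zero_coefficients[kHorizontal]
// counts the zero outer tap pairs, so each branch passes it as a literal
// and the inlined WienerHorizontal can start at the first nonzero tap; with
// the value 3 only the 128-valued center tap survives, reducing the
// horizontal pass to a scale-and-round of each source pixel.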
@@ -195,28 +220,29 @@
memcpy(wiener_buffer_org, wiener_buffer_org + width,
sizeof(*wiener_buffer) * width);
WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
- filter_vertical, 0, dest, dest_stride);
+ filter_vertical, 0, dest, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
- filter_vertical, 1, dest, dest_stride);
+ filter_vertical, 1, dest, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
- filter_vertical, 2, dest, dest_stride);
+ filter_vertical, 2, dest, stride);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
- filter_vertical, 3, dest, dest_stride);
+ filter_vertical, 3, dest, stride);
}
}
//------------------------------------------------------------------------------
// SGR
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
template <typename Pixel, int size>
LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
const int height, const int width,
- uint16_t* sums, uint32_t* square_sums,
- const ptrdiff_t sum_stride) {
+ uint16_t* const* sums,
+ uint32_t* const* square_sums) {
int y = height;
do {
uint32_t sum = 0;
@@ -226,8 +252,8 @@
sum += source;
square_sum += source * source;
}
- sums[0] = sum;
- square_sums[0] = square_sum;
+ (*sums)[0] = sum;
+ (*square_sums)[0] = square_sum;
int x = 1;
do {
const Pixel source0 = src[x - 1];
@@ -236,21 +262,22 @@
sum += source1;
square_sum -= source0 * source0;
square_sum += source1 * source1;
- sums[x] = sum;
- square_sums[x] = square_sum;
+ (*sums)[x] = sum;
+ (*square_sums)[x] = square_sum;
} while (++x != width);
src += src_stride;
- sums += sum_stride;
- square_sums += sum_stride;
+ ++sums;
+ ++square_sums;
} while (--y != 0);
}
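// Sketch (not in the diff; call shapes mirror the ones later in this file):
// taking uint16_t* const* turns the sums into a ring of row pointers, so
// stepping to the next source row is ++sums rather than += sum_stride, and
// duplicating a boundary row becomes pointer aliasing instead of a memcpy:
//   uint16_t* sum5[5];                 // five rows of 5x5 box sums
//   uint32_t* square_sum5[5];
//   sum5[0] = sum5[1];                 // was: memcpy(sum5[0], sum5[1], ...)
//   square_sum5[0] = square_sum5[1];
//   BoxSum<Pixel, 5>(src, stride, /*height=*/1, width + 2, sum5 + 3,
//                    square_sum5 + 3);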
+// When |height| is 1, |src_stride| could be set to an arbitrary value.
template <typename Pixel>
LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
const int height, const int width,
- uint16_t* sum3, uint16_t* sum5,
- uint32_t* square_sum3, uint32_t* square_sum5,
- const ptrdiff_t sum_stride) {
+ uint16_t* const* sum3, uint16_t* const* sum5,
+ uint32_t* const* square_sum3,
+ uint32_t* const* square_sum5) {
int y = height;
do {
uint32_t sum = 0;
@@ -266,18 +293,18 @@
const Pixel source1 = src[x + 4];
sum -= source0;
square_sum -= source0 * source0;
- sum3[x] = sum;
- square_sum3[x] = square_sum;
+ (*sum3)[x] = sum;
+ (*square_sum3)[x] = square_sum;
sum += source1;
square_sum += source1 * source1;
- sum5[x] = sum + source0;
- square_sum5[x] = square_sum + source0 * source0;
+ (*sum5)[x] = sum + source0;
+ (*square_sum5)[x] = square_sum + source0 * source0;
} while (++x != width);
src += src_stride;
- sum3 += sum_stride;
- sum5 += sum_stride;
- square_sum3 += sum_stride;
- square_sum5 += sum_stride;
+ ++sum3;
+ ++sum5;
+ ++square_sum3;
+ ++square_sum5;
} while (--y != 0);
}
@@ -396,20 +423,20 @@
}
template <typename Pixel>
-inline void BoxFilterPass(const Pixel src0, const Pixel src1,
- const uint16_t* const ma565[2],
- const uint32_t* const b565[2], const ptrdiff_t x,
- int p[2]) {
+inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
+ const uint16_t* const ma565[2],
+ const uint32_t* const b565[2],
+ const ptrdiff_t x, int p[2]) {
p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
b565[0][x] + b565[1][x], 5);
p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
}
template <typename Pixel>
-inline int BoxFilterPass2(const Pixel src, const uint16_t* const ma343[3],
- const uint16_t* const ma444,
- const uint32_t* const b343[3],
- const uint32_t* const b444, const ptrdiff_t x) {
+inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
+ const uint16_t* const ma444,
+ const uint32_t* const b343[3],
+ const uint32_t* const b444, const ptrdiff_t x) {
const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
@@ -441,37 +468,90 @@
return SelfGuidedFinal<bitdepth, Pixel>(src, v);
}
-template <typename T>
-void Circulate3PointersBy1(T* p[3]) {
- T* const p0 = p[0];
- p[0] = p[1];
- p[1] = p[2];
- p[2] = p0;
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width,
+ const uint32_t scale, const int16_t w0,
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma565[2], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
+ ma565[1], b565[1]);
+ int x = 0;
+ do {
+ int p[2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
+ dst[stride + x] =
+ SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
+ } while (++x != width);
}
-template <typename T>
-void Circulate4PointersBy2(T* p[4]) {
- std::swap(p[0], p[2]);
- std::swap(p[1], p[3]);
+template <int bitdepth, typename Pixel>
+inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
+ const int width, const uint16_t scale,
+ const int16_t w0, uint16_t* const sum3[4],
+ uint32_t* const square_sum3[4],
+ SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint32_t* const b343[4], uint32_t* const b444[3],
+ Pixel* dst) {
+ BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ int x = 0;
+ do {
+ const int p =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
+ } while (++x != width);
}
-template <typename T>
-void Circulate5PointersBy2(T* p[5]) {
- T* const p0 = p[0];
- T* const p1 = p[1];
- p[0] = p[2];
- p[1] = p[3];
- p[2] = p[4];
- p[3] = p0;
- p[4] = p1;
+template <int bitdepth, typename Pixel>
+inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], const int width,
+ const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, SgrBuffer* const sgr_buffer,
+ uint16_t* const ma343[4], uint16_t* const ma444[3],
+ uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2],
+ Pixel* dst) {
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
+ sgr_buffer, ma565[1], b565[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
+ sgr_buffer, ma343[2], b343[2], ma444[1],
+ b444[1]);
+ BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
+ true, sgr_buffer, ma343[3], b343[3], ma444[2],
+ b444[2]);
+ int x = 0;
+ do {
+ int p[2][2];
+ BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
+ p[1][0] =
+ BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
+ b343 + 1, b444[1], x);
+ dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
+ p[1][0], w0, w2);
+ dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
+ src[stride + x], p[0][1], p[1][1], w0, w2);
+ } while (++x != width);
}
template <int bitdepth, typename Pixel>
inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
- const Pixel* src, const ptrdiff_t src_stride,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
const int width, const int height,
- SgrBuffer* const sgr_buffer, Pixel* dst,
- const ptrdiff_t dst_stride) {
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 8);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
@@ -509,10 +589,15 @@
b565[1] = b565[0] + temp_stride;
assert(scales[0] != 0);
assert(scales[1] != 0);
- BoxSum<Pixel>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum3[0],
- sum5[1], square_sum3[0], square_sum5[1], sum_stride);
- memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride);
- memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride);
+ BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
+ square_sum3, square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
+ square_sum5 + 4);
BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
sgr_buffer, ma565[0], b565[0]);
BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
@@ -521,38 +606,21 @@
BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
true, sgr_buffer, ma343[1], b343[1], ma444[0],
b444[0]);
- for (int y = height >> 1; y != 0; --y) {
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
Circulate4PointersBy2<uint16_t>(sum3);
Circulate4PointersBy2<uint32_t>(square_sum3);
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2],
- sum5[3], square_sum3[2], square_sum5[3], sum_stride);
- BoxSum<Pixel>(src + 3 * src_stride - 3, src_stride, 1, width + 2, sum3[3],
- sum5[4], square_sum3[3], square_sum5[4], sum_stride);
- BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
- sgr_buffer, ma565[1], b565[1]);
- BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
- sgr_buffer, ma343[2], b343[2], ma444[1],
- b444[1]);
- BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
- true, sgr_buffer, ma343[3], b343[3],
- ma444[2], b444[2]);
- int x = 0;
- do {
- int p[2][2];
- BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p[0]);
- p[1][0] =
- BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
- p[1][1] = BoxFilterPass2<Pixel>(src[src_stride + x], ma343 + 1, ma444[1],
- b343 + 1, b444[1], x);
- dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
- p[1][0], w0, w2);
- dst[dst_stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
- src[src_stride + x], p[0][1], p[1][1], w0, w2);
- } while (++x != width);
- src += 2 * src_stride;
- dst += 2 * dst_stride;
+ BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
Circulate4PointersBy2<uint16_t>(ma343);
Circulate4PointersBy2<uint32_t>(b343);
std::swap(ma444[0], ma444[2]);
@@ -560,15 +628,48 @@
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = bottom_border_stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
+ square_sum3 + 2, square_sum5 + 3);
+ BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
+ square_sum5, width, scales, w0, w2, sgr_buffer,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ }
if ((height & 1) != 0) {
- Circulate4PointersBy2<uint16_t>(sum3);
- Circulate4PointersBy2<uint32_t>(square_sum3);
- Circulate5PointersBy2<uint16_t>(sum5);
- Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxSum<Pixel>(src + 2 * src_stride - 3, src_stride, 1, width + 2, sum3[2],
- sum5[3], square_sum3[2], square_sum5[3], sum_stride);
- memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride);
- memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride);
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
+ width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
+ square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
sgr_buffer, ma565[1], b565[1]);
BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
@@ -578,8 +679,8 @@
do {
const int p0 = CalculateFilteredOutput<Pixel>(
src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
- const int p1 =
- BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
+ const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
+ b444[0], x);
dst[x] =
SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
} while (++x != width);
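// Note (not in the diff): the rewritten tail finishes the unit without row
// copies: for even |height| the last pair of summed rows comes from
// |bottom_border|, for odd |height| the remaining in-frame row is consumed
// first, and the old memcpy of the final row's sums is replaced by aliasing
// sum5[4] = sum5[3] before the closing BoxFilterPreProcess5 call.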
@@ -588,14 +689,17 @@
template <int bitdepth, typename Pixel>
inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
- const Pixel* src, const ptrdiff_t src_stride,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
const int width, const int height,
- SgrBuffer* const sgr_buffer, Pixel* dst,
- const ptrdiff_t dst_stride) {
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
const auto temp_stride = Align<ptrdiff_t>(width, 8);
const ptrdiff_t sum_stride = temp_stride + 8;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
- const uint32_t s = kSgrScaleParameter[sgr_proj_index][0]; // s < 2^12.
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
uint16_t *sum5[5], *ma565[2];
uint32_t *square_sum5[5], *b565[2];
@@ -609,43 +713,63 @@
ma565[1] = ma565[0] + temp_stride;
b565[0] = sgr_buffer->b565;
b565[1] = b565[0] + temp_stride;
- assert(s != 0);
- BoxSum<Pixel, 5>(src - 2 * src_stride - 3, src_stride, 4, width + 2, sum5[1],
- square_sum5[1], sum_stride);
- memcpy(sum5[0], sum5[1], sizeof(**sum5) * sum_stride);
- memcpy(square_sum5[0], square_sum5[1], sizeof(**square_sum5) * sum_stride);
- BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
+ assert(scale != 0);
+ BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
+ square_sum5 + 1);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
+ const Pixel* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
ma565[0], b565[0]);
- for (int y = height >> 1; y != 0; --y) {
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
Circulate5PointersBy2<uint16_t>(sum5);
Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2,
- sum5[3], square_sum5[3], sum_stride);
- BoxSum<Pixel, 5>(src + 3 * src_stride - 3, src_stride, 1, width + 2,
- sum5[4], square_sum5[4], sum_stride);
- BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
- ma565[1], b565[1]);
- int x = 0;
- do {
- int p[2];
- BoxFilterPass<Pixel>(src[x], src[src_stride + x], ma565, b565, x, p);
- dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
- dst[dst_stride + x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(
- src[src_stride + x], p[1], w0);
- } while (++x != width);
- src += 2 * src_stride;
- dst += 2 * dst_stride;
+ BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
+ square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
std::swap(ma565[0], ma565[1]);
std::swap(b565[0], b565[1]);
}
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const Pixel* sr;
+ ptrdiff_t s_stride;
+ if ((height & 1) == 0) {
+ sr = bottom_border;
+ s_stride = bottom_border_stride;
+ } else {
+ sr = src + 2 * stride;
+ s_stride = bottom_border - (src + 2 * stride);
+ }
+ BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
+ BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
+ scale, w0, sgr_buffer, ma565, b565, dst);
+ }
if ((height & 1) != 0) {
- Circulate5PointersBy2<uint16_t>(sum5);
- Circulate5PointersBy2<uint32_t>(square_sum5);
- BoxSum<Pixel, 5>(src + 2 * src_stride - 3, src_stride, 1, width + 2,
- sum5[3], square_sum5[3], sum_stride);
- memcpy(sum5[4], sum5[3], sizeof(**sum5) * sum_stride);
- memcpy(square_sum5[4], square_sum5[3], sizeof(**square_sum5) * sum_stride);
- BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, s, sgr_buffer,
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
+ 1, width + 2, sum5 + 3, square_sum5 + 3);
+ sum5[4] = sum5[3];
+ square_sum5[4] = square_sum5[3];
+ BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
ma565[1], b565[1]);
int x = 0;
do {
@@ -658,17 +782,20 @@
template <int bitdepth, typename Pixel>
inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
- const Pixel* src, const ptrdiff_t src_stride,
+ const Pixel* src, const ptrdiff_t stride,
+ const Pixel* const top_border,
+ const ptrdiff_t top_border_stride,
+ const Pixel* bottom_border,
+ const ptrdiff_t bottom_border_stride,
const int width, const int height,
- SgrBuffer* const sgr_buffer, Pixel* dst,
- const ptrdiff_t dst_stride) {
+ SgrBuffer* const sgr_buffer, Pixel* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
const auto temp_stride = Align<ptrdiff_t>(width, 8);
const ptrdiff_t sum_stride = temp_stride + 8;
const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
const int sgr_proj_index = restoration_info.sgr_proj_info.index;
- const uint32_t s = kSgrScaleParameter[sgr_proj_index][1]; // s < 2^12.
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
uint16_t *sum3[3], *ma343[3], *ma444[2];
uint32_t *square_sum3[3], *b343[3], *b444[2];
sum3[0] = sgr_buffer->sum3;
@@ -685,34 +812,52 @@
ma444[1] = ma444[0] + temp_stride;
b444[0] = sgr_buffer->b444;
b444[1] = b444[0] + temp_stride;
- assert(s != 0);
- BoxSum<Pixel, 3>(src - 2 * src_stride - 2, src_stride, 3, width + 2, sum3[0],
- square_sum3[0], sum_stride);
- BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, false, sgr_buffer,
- ma343[0], b343[0], nullptr, nullptr);
+ assert(scale != 0);
+ BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
+ square_sum3);
+ BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
+ sgr_buffer, ma343[0], b343[0], nullptr,
+ nullptr);
Circulate3PointersBy1<uint16_t>(sum3);
Circulate3PointersBy1<uint32_t>(square_sum3);
- BoxSum<Pixel, 3>(src + src_stride - 2, src_stride, 1, width + 2, sum3[2],
- square_sum3[2], sum_stride);
- BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true, sgr_buffer,
- ma343[1], b343[1], ma444[0], b444[0]);
- int y = height;
+ const Pixel* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
+ BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
+ sgr_buffer, ma343[1], b343[1], ma444[0],
+ b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
+ sum3, square_sum3, sgr_buffer, ma343, ma444,
+ b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ src += 2;
+ int y = std::min(height, 2);
do {
Circulate3PointersBy1<uint16_t>(sum3);
Circulate3PointersBy1<uint32_t>(square_sum3);
- BoxSum<Pixel, 3>(src + 2 * src_stride - 2, src_stride, 1, width + 2,
- sum3[2], square_sum3[2], sum_stride);
- BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, s, true,
- sgr_buffer, ma343[2], b343[2], ma444[1],
- b444[1]);
- int x = 0;
- do {
- const int p =
- BoxFilterPass2<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
- dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
- } while (++x != width);
- src += src_stride;
- dst += dst_stride;
+ BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
+ square_sum3, sgr_buffer, ma343, ma444, b343,
+ b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
Circulate3PointersBy1<uint16_t>(ma343);
Circulate3PointersBy1<uint32_t>(b343);
std::swap(ma444[0], ma444[1]);
@@ -721,32 +866,35 @@
}
template <int bitdepth, typename Pixel>
-void SelfGuidedFilter_C(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- ptrdiff_t source_stride, ptrdiff_t dest_stride,
- int width, int height,
- RestorationBuffer* const restoration_buffer) {
+void SelfGuidedFilter_C(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
const auto* src = static_cast<const Pixel*>(source);
+ const auto* top = static_cast<const Pixel*>(top_border);
+ const auto* bottom = static_cast<const Pixel*>(bottom_border);
auto* dst = static_cast<Pixel*>(dest);
SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
if (radius_pass_1 == 0) {
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1<bitdepth, Pixel>(restoration_info, src, source_stride,
- width, height, sgr_buffer, dst,
- dest_stride);
+ BoxFilterProcessPass1<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2<bitdepth, Pixel>(restoration_info, src, source_stride,
- width, height, sgr_buffer, dst,
- dest_stride);
+ BoxFilterProcessPass2<bitdepth, Pixel>(
+ restoration_info, src - 2, stride, top - 2, top_border_stride,
+ bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess<bitdepth, Pixel>(restoration_info, src, source_stride,
- width, height, sgr_buffer, dst,
- dest_stride);
+ BoxFilterProcess<bitdepth, Pixel>(
+ restoration_info, src - 3, stride, top - 3, top_border_stride,
+ bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
}
}
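The box-filter passes above obtain all of their intermediates from BoxSum<Pixel, n>, which accumulates a sum and a sum of squares over an n-pixel window. A minimal scalar sketch of the 5-tap kernel's per-column arithmetic follows; the name and signature are illustrative only (the real template also walks rows and takes stride arguments):

#include <cstdint>

// Per-column sum and sum of squares over a 5-wide window. |src| must
// provide width + 4 readable pixels.
void BoxSum5Row(const uint8_t* src, int width, uint16_t* sum,
                uint32_t* square_sum) {
  for (int x = 0; x < width; ++x) {
    uint16_t s = 0;
    uint32_t ss = 0;
    for (int dx = 0; dx < 5; ++dx) {
      s += src[x + dx];
      ss += static_cast<uint32_t>(src[x + dx]) * src[x + dx];
    }
    sum[x] = s;          // Feeds the sum5[]/sum3[] rows consumed above.
    square_sum[x] = ss;  // Feeds the square_sum5[]/square_sum3[] rows.
  }
}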
diff --git a/libgav1/src/dsp/loop_restoration.h b/libgav1/src/dsp/loop_restoration.h
index a902e9b..de80926 100644
--- a/libgav1/src/dsp/loop_restoration.h
+++ b/libgav1/src/dsp/loop_restoration.h
@@ -30,6 +30,7 @@
// The order of includes is important as each tests for a superior version
// before setting the base.
// clang-format off
+#include "src/dsp/x86/loop_restoration_avx2.h"
#include "src/dsp/x86/loop_restoration_sse4.h"
// clang-format on
@@ -53,6 +54,31 @@
// Initializes Dsp::loop_restorations. This function is not thread-safe.
void LoopRestorationInit_C();
+template <typename T>
+void Circulate3PointersBy1(T* p[3]) {
+ T* const p0 = p[0];
+ p[0] = p[1];
+ p[1] = p[2];
+ p[2] = p0;
+}
+
+template <typename T>
+void Circulate4PointersBy2(T* p[4]) {
+ std::swap(p[0], p[2]);
+ std::swap(p[1], p[3]);
+}
+
+template <typename T>
+void Circulate5PointersBy2(T* p[5]) {
+ T* const p0 = p[0];
+ T* const p1 = p[1];
+ p[0] = p[2];
+ p[1] = p[3];
+ p[2] = p[4];
+ p[3] = p0;
+ p[4] = p1;
+}
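These helpers rotate a ring of row-buffer pointers so the restoration passes can slide their vertical windows without copying any row data. A self-contained sketch of the by-2 rotation (the template body is copied from above; the driver is ours):

#include <cassert>

template <typename T>
void Circulate5By2(T* p[5]) {
  T* const p0 = p[0];
  T* const p1 = p[1];
  p[0] = p[2];
  p[1] = p[3];
  p[2] = p[4];
  p[3] = p0;
  p[4] = p1;
}

int main() {
  int rows[5] = {0, 1, 2, 3, 4};
  int* p[5] = {&rows[0], &rows[1], &rows[2], &rows[3], &rows[4]};
  Circulate5By2(p);
  // The window advances two rows; the two oldest buffers rotate to the back,
  // where the next BoxSum call overwrites them.
  assert(*p[0] == 2 && *p[3] == 0 && *p[4] == 1);
  return 0;
}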
+
} // namespace dsp
} // namespace libgav1
diff --git a/libgav1/src/dsp/mask_blend.cc b/libgav1/src/dsp/mask_blend.cc
index 101c410..15ef821 100644
--- a/libgav1/src/dsp/mask_blend.cc
+++ b/libgav1/src/dsp/mask_blend.cc
@@ -25,8 +25,8 @@
namespace dsp {
namespace {
-template <int subsampling_x, int subsampling_y>
-uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x) {
+uint8_t GetMaskValue(const uint8_t* mask, const uint8_t* mask_next_row, int x,
+ int subsampling_x, int subsampling_y) {
if ((subsampling_x | subsampling_y) == 0) {
return mask[x];
}
@@ -63,7 +63,7 @@
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
const uint8_t mask_value =
- GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
if (is_inter_intra) {
dst[x] = static_cast<Pixel>(RightShiftWithRounding(
mask_value * pred_1[x] + (64 - mask_value) * pred_0[x], 6));
@@ -96,7 +96,7 @@
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
const uint8_t mask_value =
- GetMaskValue<subsampling_x, subsampling_y>(mask, mask_next_row, x);
+ GetMaskValue(mask, mask_next_row, x, subsampling_x, subsampling_y);
prediction_1[x] = static_cast<uint8_t>(RightShiftWithRounding(
mask_value * prediction_1[x] + (64 - mask_value) * prediction_0[x],
6));
@@ -148,6 +148,7 @@
#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
dsp->inter_intra_mask_blend_8bpp[2] = InterIntraMaskBlend8bpp_C<1, 1>;
#endif
+ static_cast<void>(GetMaskValue);
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
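Dropping the template parameters does not change the blend arithmetic: |mask_value| weights the two predictors in 1/64 units, and RightShiftWithRounding(x, 6) is (x + 32) >> 6. A scalar sketch of the inter-intra case shown above (the helper name is ours, not the library's):

#include <cstdint>

inline uint8_t BlendPixel(uint8_t p0, uint8_t p1, uint8_t mask_value) {
  // mask_value == 64 selects p1 entirely, 0 selects p0, 32 is the midpoint.
  return static_cast<uint8_t>(
      (mask_value * p1 + (64 - mask_value) * p0 + 32) >> 6);
}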
diff --git a/libgav1/src/dsp/super_res.cc b/libgav1/src/dsp/super_res.cc
index 9379f46..abb01a1 100644
--- a/libgav1/src/dsp/super_res.cc
+++ b/libgav1/src/dsp/super_res.cc
@@ -25,47 +25,57 @@
namespace {
template <int bitdepth, typename Pixel>
-void ComputeSuperRes(const void* source, const int upscaled_width,
- const int initial_subpixel_x, const int step,
- void* const dest) {
- // If (original) upscaled_width is <= 9, the downscaled_width may be
- // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
- // subsampled via RightShiftWithRounding. This leads to an edge case where
- // |step| == 1 << 14.
- assert(step <= kSuperResScaleMask || upscaled_width <= 4);
- const auto* src = static_cast<const Pixel*>(source);
+void SuperRes_C(const void* /*coefficients*/, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step, void* const dest,
+ ptrdiff_t dest_stride) {
+ assert(step <= 1 << kSuperResScaleBits);
+ auto* src = static_cast<Pixel*>(source) - DivideBy2(kSuperResFilterTaps);
auto* dst = static_cast<Pixel*>(dest);
- src -= DivideBy2(kSuperResFilterTaps);
- int subpixel_x = initial_subpixel_x;
- for (int x = 0; x < upscaled_width; ++x) {
- int sum = 0;
- const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
- const int src_x_subpixel =
- (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
- // The sign of each tap is: - + - + + - + -
- sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
- sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
- sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
- sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
- sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
- sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
- sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
- sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
- dst[x] =
- Clip3(RightShiftWithRounding(sum, kFilterBits), 0, (1 << bitdepth) - 1);
- subpixel_x += step;
- }
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ // If (original) upscaled_width is <= 9, the downscaled_width may be
+ // upscaled_width - 1 (i.e. 8, 9), and become the same (i.e. 4) when
+ // subsampled via RightShiftWithRounding. This leads to an edge case where
+ // |step| == 1 << 14.
+ int subpixel_x = initial_subpixel_x;
+ int x = 0;
+ do {
+ int sum = 0;
+ const Pixel* const src_x = &src[subpixel_x >> kSuperResScaleBits];
+ const int src_x_subpixel =
+ (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
+ // The sign of each tap is: - + - + + - + -
+ sum -= src_x[0] * kUpscaleFilterUnsigned[src_x_subpixel][0];
+ sum += src_x[1] * kUpscaleFilterUnsigned[src_x_subpixel][1];
+ sum -= src_x[2] * kUpscaleFilterUnsigned[src_x_subpixel][2];
+ sum += src_x[3] * kUpscaleFilterUnsigned[src_x_subpixel][3];
+ sum += src_x[4] * kUpscaleFilterUnsigned[src_x_subpixel][4];
+ sum -= src_x[5] * kUpscaleFilterUnsigned[src_x_subpixel][5];
+ sum += src_x[6] * kUpscaleFilterUnsigned[src_x_subpixel][6];
+ sum -= src_x[7] * kUpscaleFilterUnsigned[src_x_subpixel][7];
+ dst[x] = Clip3(RightShiftWithRounding(sum, kFilterBits), 0,
+ (1 << bitdepth) - 1);
+ subpixel_x += step;
+ } while (++x < upscaled_width);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
}
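|subpixel_x| is a fixed-point source coordinate: the bits above kSuperResScaleBits index the source pixel, and the top fractional bits select one of the kUpscaleFilterUnsigned phases. A sketch of the mapping, assuming libgav1's fixed-point layout (14 scale bits, 8 extra bits, hence 64 phases; these constants are assumptions here, not quoted from the source):

#include <cstdint>

constexpr int kScaleBits = 14;  // kSuperResScaleBits (assumed).
constexpr int kExtraBits = 8;   // kSuperResExtraBits (assumed).
constexpr int kScaleMask = (1 << kScaleBits) - 1;

struct TapPosition {
  int src_x;  // Leftmost of the 8 source taps.
  int phase;  // One of 64 interpolation phases.
};

inline TapPosition MapOutputPixel(int subpixel_x) {
  return {subpixel_x >> kScaleBits, (subpixel_x & kScaleMask) >> kExtraBits};
}

// E.g. a 2x upscale uses step = 1 << 13, so successive output pixels
// alternate between phase 0 (exactly on a source pixel) and phase 32.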
void Init8bpp() {
Dsp* dsp = dsp_internal::GetWritableDspTable(8);
assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- dsp->super_res_row = ComputeSuperRes<8, uint8_t>;
+ dsp->super_res = SuperRes_C<8, uint8_t>;
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
static_cast<void>(dsp);
#ifndef LIBGAV1_Dsp8bpp_SuperRes
- dsp->super_res_row = ComputeSuperRes<8, uint8_t>;
+ dsp->super_res = SuperRes_C<8, uint8_t>;
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
@@ -74,12 +84,13 @@
void Init10bpp() {
Dsp* dsp = dsp_internal::GetWritableDspTable(10);
assert(dsp != nullptr);
+ dsp->super_res_coefficients = nullptr;
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- dsp->super_res_row = ComputeSuperRes<10, uint16_t>;
+ dsp->super_res = SuperRes_C<10, uint16_t>;
#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
static_cast<void>(dsp);
#ifndef LIBGAV1_Dsp10bpp_SuperRes
- dsp->super_res_row = ComputeSuperRes<10, uint16_t>;
+ dsp->super_res = SuperRes_C<10, uint16_t>;
#endif
#endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
}
diff --git a/libgav1/src/dsp/super_res.h b/libgav1/src/dsp/super_res.h
index cd69474..2ca9d2b 100644
--- a/libgav1/src/dsp/super_res.h
+++ b/libgav1/src/dsp/super_res.h
@@ -38,7 +38,7 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::super_res_row. This function is not thread-safe.
+// Initializes Dsp::super_res. This function is not thread-safe.
void SuperResInit_C();
} // namespace dsp
diff --git a/libgav1/src/dsp/x86/average_blend_sse4.cc b/libgav1/src/dsp/x86/average_blend_sse4.cc
index 6c37658..ec9f589 100644
--- a/libgav1/src/dsp/x86/average_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/average_blend_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/average_blend.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
@@ -30,6 +30,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
@@ -138,13 +139,232 @@
}
} // namespace
+} // namespace low_bitdepth
-void AverageBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kInterPostRoundBitPlusOne = 5;
+
+template <const int width, const int offset>
+inline void AverageBlendRow(const uint16_t* prediction_0,
+ const uint16_t* prediction_1,
+ const __m128i& compound_offset,
+ const __m128i& round_offset, const __m128i& max,
+ const __m128i& zero, uint16_t* dst,
+ const ptrdiff_t dest_stride) {
+ // pred_0/1 max range is 16b.
+ const __m128i pred_0 = LoadUnaligned16(prediction_0 + offset);
+ const __m128i pred_1 = LoadUnaligned16(prediction_1 + offset);
+ const __m128i pred_00 = _mm_cvtepu16_epi32(pred_0);
+ const __m128i pred_01 = _mm_unpackhi_epi16(pred_0, zero);
+ const __m128i pred_10 = _mm_cvtepu16_epi32(pred_1);
+ const __m128i pred_11 = _mm_unpackhi_epi16(pred_1, zero);
+
+ const __m128i pred_add_0 = _mm_add_epi32(pred_00, pred_10);
+ const __m128i pred_add_1 = _mm_add_epi32(pred_01, pred_11);
+ const __m128i compound_offset_0 = _mm_sub_epi32(pred_add_0, compound_offset);
+ const __m128i compound_offset_1 = _mm_sub_epi32(pred_add_1, compound_offset);
+ // RightShiftWithRounding and Clip3.
+ const __m128i round_0 = _mm_add_epi32(compound_offset_0, round_offset);
+ const __m128i round_1 = _mm_add_epi32(compound_offset_1, round_offset);
+ const __m128i res_0 = _mm_srai_epi32(round_0, kInterPostRoundBitPlusOne);
+ const __m128i res_1 = _mm_srai_epi32(round_1, kInterPostRoundBitPlusOne);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(res_0, res_1), max);
+ if (width != 4) {
+ // Store width=8/16/32/64/128.
+ StoreUnaligned16(dst + offset, result);
+ return;
+ }
+ assert(width == 4);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dest_stride, result);
+}
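Each AverageBlendRow call above is eight lanes of the following per-pixel computation: undo the doubled compound-prediction offset, round, shift by kInterPostRoundBitPlusOne, and clamp to the 10-bit range. A scalar sketch (kCompoundOffset stands in for the library constant of the same name):

#include <algorithm>
#include <cstdint>

inline uint16_t AverageBlendPixel10bpp(int pred_0, int pred_1,
                                       int compound_offset) {
  const int shift = 5;  // kInterPostRoundBitPlusOne.
  const int sum = pred_0 + pred_1 - 2 * compound_offset;
  const int rounded = (sum + (1 << (shift - 1))) >> shift;
  return static_cast<uint16_t>(std::clamp(rounded, 0, (1 << 10) - 1));
}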
+
+void AverageBlend10bpp_SSE4_1(const void* prediction_0,
+ const void* prediction_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dst_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dest_stride = dst_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const __m128i compound_offset =
+ _mm_set1_epi32(kCompoundOffset + kCompoundOffset);
+ const __m128i round_offset =
+ _mm_set1_epi32((1 << kInterPostRoundBitPlusOne) >> 1);
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ int y = height;
+
+ if (width == 4) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0,1
+ AverageBlendRow<4, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 8) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 16) {
+ const ptrdiff_t dest_stride2 = dest_stride << 1;
+ const ptrdiff_t width2 = width << 1;
+ do {
+ // row0.
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // row1.
+ AverageBlendRow<8, 0>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ AverageBlendRow<8, 8>(pred_0 + width, pred_1 + width, compound_offset,
+ round_offset, max, zero, dst + dest_stride,
+ dest_stride);
+ dst += dest_stride2;
+ pred_0 += width2;
+ pred_1 += width2;
+ y -= 2;
+ } while (y != 0);
+ return;
+ }
+ if (width == 32) {
+ do {
+ // pred [0 - 15].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [16 - 31].
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ if (width == 64) {
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+ return;
+ }
+ assert(width == 128);
+ do {
+ // pred [0 - 31].
+ AverageBlendRow<8, 0>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 8>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 16>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 24>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [32 - 63].
+ AverageBlendRow<8, 32>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 40>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 48>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 56>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+
+ // pred [64 - 95].
+ AverageBlendRow<8, 64>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 72>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 80>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 88>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ // pred [96 - 127].
+ AverageBlendRow<8, 96>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 104>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 112>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ AverageBlendRow<8, 120>(pred_0, pred_1, compound_offset, round_offset, max,
+ zero, dst, dest_stride);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(AverageBlend)
+ dsp->average_blend = AverageBlend10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void AverageBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -153,4 +373,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/average_blend_sse4.h b/libgav1/src/dsp/x86/average_blend_sse4.h
index e205c2b..cd07112 100644
--- a/libgav1/src/dsp/x86/average_blend_sse4.h
+++ b/libgav1/src/dsp/x86/average_blend_sse4.h
@@ -31,11 +31,15 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
#ifndef LIBGAV1_Dsp8bpp_AverageBlend
#define LIBGAV1_Dsp8bpp_AverageBlend LIBGAV1_CPU_SSE4_1
#endif
+#ifndef LIBGAV1_Dsp10bpp_AverageBlend
+#define LIBGAV1_Dsp10bpp_AverageBlend LIBGAV1_CPU_SSE4_1
+#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_AVERAGE_BLEND_SSE4_H_
diff --git a/libgav1/src/dsp/x86/cdef_avx2.cc b/libgav1/src/dsp/x86/cdef_avx2.cc
new file mode 100644
index 0000000..d41dc38
--- /dev/null
+++ b/libgav1/src/dsp/x86/cdef_avx2.cc
@@ -0,0 +1,784 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/cdef.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/cdef.inc"
+
+// Used when calculating odd |cost[x]| values.
+// Holds |kCdefDivisionTable| elements 1 3 5 7 (once per 128-bit lane) in the
+// first row, with element 7 padding the second row.
+alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
+ 420, 210, 140, 105, 420, 210, 140, 105,
+ 105, 105, 105, 105, 105, 105, 105, 105};
+
+// ----------------------------------------------------------------------------
+// Refer to CdefDirection_C().
+//
+// int32_t partial[8][15] = {};
+// for (int i = 0; i < 8; ++i) {
+// for (int j = 0; j < 8; ++j) {
+// const int x = 1;
+// partial[0][i + j] += x;
+// partial[1][i + j / 2] += x;
+// partial[2][i] += x;
+// partial[3][3 + i - j / 2] += x;
+// partial[4][7 + i - j] += x;
+// partial[5][3 - i / 2 + j] += x;
+// partial[6][j] += x;
+// partial[7][i / 2 + j] += x;
+// }
+// }
+//
+// Using the code above, generate the position count for partial[8][15].
+//
+// partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
+// partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+// partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
+// partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
+//
+// The SIMD code shifts the input horizontally, then adds vertically to get the
+// correct partial value for the given position.
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// partial[0][i + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
+// 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
+// 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
+// 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
+// 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
+// 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
+// 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
+//
+// partial[4] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ // 00 01 02 03 04 05 06 07
+ *partial_lo = v_src_16[0];
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 10 11 12 13 14 15 16
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
+ // 17 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));
+
+ // 00 00 20 21 22 23 24 25
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
+ // 26 27 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));
+
+ // 00 00 00 30 31 32 33 34
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
+ // 35 36 37 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));
+
+ // 00 00 00 00 40 41 42 43
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
+ // 44 45 46 47 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));
+
+ // 00 00 00 00 00 50 51 52
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
+ // 53 54 55 56 57 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));
+
+ // 00 00 00 00 00 00 60 61
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
+ // 62 63 64 65 66 67 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));
+
+ // 00 00 00 00 00 00 00 70
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
+ // 71 72 73 74 75 76 77 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
+}
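This mirrors the scalar accumulation quoted at the top of the file: for partial[0], source row i contributes to outputs i..i+7, i.e. each row is shifted right one more position before the vertical add. A plain C++ reference:

#include <cstdint>

// Scalar reference for partial[0]; partial[4] is the same except the source
// is reversed.
void AddPartialD0(const uint8_t src[8][8], uint16_t partial[15]) {
  for (int k = 0; k < 15; ++k) partial[k] = 0;
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      partial[i + j] += src[i][j];
    }
  }
}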
+
+// ----------------------------------------------------------------------------
+// partial[1][i + j / 2] += x;
+//
+// A0 = src[0] + src[1], A1 = src[2] + src[3], ...
+//
+// A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
+// 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
+// 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
+// 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
+// 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
+// 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
+// 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
+// 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
+//
+// partial[3] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
+ __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_d1_temp[8];
+ const __m256i v_zero = _mm256_setzero_si256();
+
+ for (int i = 0; i < 8; ++i) {
+ v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
+ }
+
+ *partial_lo = *partial_hi = v_zero;
+ // A0 A1 A2 A3 00 00 00 00
+ *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);
+
+ // 00 B0 B1 B2 B3 00 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));
+
+ // 00 00 C0 C1 C2 C3 00 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
+ // 00 00 00 D0 D1 D2 D3 00
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
+ // 00 00 00 00 E0 E1 E2 E3
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));
+
+ // 00 00 00 00 00 F0 F1 F2
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
+ // F3 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));
+
+ // 00 00 00 00 00 00 G0 G1
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
+ // G2 G3 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));
+
+ // 00 00 00 00 00 00 00 H0
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
+ // H1 H2 H3 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
+}
+
+// ----------------------------------------------------------------------------
+// partial[7][i / 2 + j] += x;
+//
+// 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
+// 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
+// 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
+// 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
+// 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
+// 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
+// 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
+// 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
+//
+// partial[5] is the same except the source is reversed.
+LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
+ __m256i* partial_hi) {
+ __m256i v_pair_add[4];
+ // Add vertical source pairs.
+ v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
+ v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
+ v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
+ v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);
+
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ *partial_lo = v_pair_add[0];
+ // 00 00 00 00 00 00 00 00
+ // 00 00 00 00 00 00 00 00
+ *partial_hi = _mm256_setzero_si256();
+
+ // 00 20 21 22 23 24 25 26
+ // 00 30 31 32 33 34 35 36
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
+ // 27 00 00 00 00 00 00 00
+ // 37 00 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));
+
+ // 00 00 40 41 42 43 44 45
+ // 00 00 50 51 52 53 54 55
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
+ // 46 47 00 00 00 00 00 00
+ // 56 57 00 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));
+
+ // 00 00 00 60 61 62 63 64
+ // 00 00 00 70 71 72 73 74
+ *partial_lo =
+ _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
+ // 65 66 67 00 00 00 00 00
+ // 75 76 77 00 00 00 00 00
+ *partial_hi =
+ _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
+}
+
+LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* src, ptrdiff_t stride,
+ __m256i* partial) {
+ // 8x8 input
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ __m256i v_src[8];
+ for (auto& i : v_src) {
+ i = _mm256_castsi128_si256(LoadLo8(src));
+ // Dup lower lane.
+ i = _mm256_permute2x128_si256(i, i, 0x0);
+ src += stride;
+ }
+
+ const __m256i v_zero = _mm256_setzero_si256();
+ // partial for direction 2
+ // --------------------------------------------------------------------------
+ // partial[2][i] += x;
+ // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx
+ // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx
+ const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
+ const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
+ const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
+ const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
+ const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
+ const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
+ const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
+ const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
+ const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
+ const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
+ const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
+ partial[2] =
+ _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
+ _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
+
+ const __m256i extend_reverse = SetrM128i(
+ _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
+ static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
+ _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
+ static_cast<int>(0x80048005),
+ static_cast<int>(0x80068007)));
+
+ for (auto& i : v_src) {
+ // Zero extend unsigned 8 to 16. The upper lane is reversed.
+ i = _mm256_shuffle_epi8(i, extend_reverse);
+ }
+
+ // partial for direction 6
+ // --------------------------------------------------------------------------
+ // partial[6][j] += x;
+ // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx
+ // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx
+ // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx
+ // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx
+ partial[6] = v_src[0];
+ for (int i = 1; i < 8; ++i) {
+ partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
+ }
+
+ AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
+ AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
+ AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
+}
+
+inline __m256i SumVectorPair_S32(__m256i a) {
+ a = _mm256_hadd_epi32(a, a);
+ a = _mm256_add_epi32(a, _mm256_srli_si256(a, 4));
+ return a;
+}
+
+// |cost[0]| and |cost[4]| square each input element, sum it with the
+// corresponding element from the other end of the vector, and scale by the
+// matching |kCdefDivisionTable[]| element:
+// cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
+//            kCdefDivisionTable[i];
+// cost[0] += Square(partial[0][7]) * kCdefDivisionTable[7];
+inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
+ const __m256i partial_4,
+ const __m256i division_table) {
+ const __m256i division_table_0 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x0);
+ const __m256i division_table_1 =
+ _mm256_permute2x128_si256(division_table, division_table, 0x11);
+
+ // partial_lo
+ const __m256i a = partial_0;
+ // partial_hi
+ const __m256i b = partial_4;
+
+ // Reverse and clear upper 2 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
+ static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));
+
+ // 14 13 12 11 10 09 08 ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 14 01 13 02 12 03 11
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 10 05 09 06 08 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+ // Copy upper 32bit sum to lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[0] = _mm_cvtsi128_si32(sums);
+ cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
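Unrolled into scalars, each even-direction cost is a weighted sum of squared diagonal totals, pairing mirror positions so one multiply covers both ends. A reference sketch, assuming kCdefDivisionTable begins with the weight for the single-sample diagonal (840, ..., 105):

#include <cstdint>

uint32_t CostEvenDirection(const uint16_t partial[15],
                           const uint32_t table[8]) {  // kCdefDivisionTable.
  uint32_t cost = 0;
  for (int i = 0; i < 7; ++i) {
    const uint32_t sq =
        static_cast<uint32_t>(partial[i]) * partial[i] +
        static_cast<uint32_t>(partial[14 - i]) * partial[14 - i];
    cost += sq * table[i];
  }
  // The center position (sample count 8) pairs with a zeroed lane above.
  cost += static_cast<uint32_t>(partial[7]) * partial[7] * table[7];
  return cost;
}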
+
+template <int index_a, int index_b>
+inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table[2]) {
+ // partial_lo
+ const __m256i a = partial_a;
+ // partial_hi
+ const __m256i b = partial_b;
+
+ // Reverse and clear upper 10 bytes.
+ const __m256i reverser = _mm256_broadcastsi128_si256(
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504));
+
+ // 10 09 08 ZZ ZZ ZZ ZZ ZZ
+ const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
+ // 00 10 01 09 02 08 03 ZZ
+ const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
+ // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
+ const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);
+
+ // Square(partial[0][i]) + Square(partial[0][14 - i])
+ const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
+ const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);
+
+ const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
+ const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
+ const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
+ // Copy upper 32bit sum to lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
+ cost[index_a] = _mm_cvtsi128_si32(sums);
+ cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
+ const __m256i partial_b,
+ const __m256i division_table) {
+ // The upper lane is a "don't care", so only use the lower lane for
+ // calculating cost.
+ const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);
+
+ const __m256i square_a = _mm256_madd_epi16(a, a);
+ const __m256i b = _mm256_mullo_epi32(square_a, division_table);
+ const __m256i c = SumVectorPair_S32(b);
+ // Copy upper 32bit sum to lower lane.
+ const __m128i sums =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
+ cost[2] = _mm_cvtsi128_si32(sums);
+ cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
+}
+
+void CdefDirection_AVX2(const void* const source, ptrdiff_t stride,
+ uint8_t* const direction, int* const variance) {
+ assert(direction != nullptr);
+ assert(variance != nullptr);
+ const auto* src = static_cast<const uint8_t*>(source);
+ uint32_t cost[8];
+
+ // partial[0] = add partial 0,4 low
+ // partial[1] = add partial 1,3 low
+ // partial[2] = add partial 2 low
+ // partial[3] = add partial 1,3 high
+ // partial[4] = add partial 0,4 high
+ // partial[5] = add partial 7,5 high
+ // partial[6] = add partial 6 low
+ // partial[7] = add partial 7,5 low
+ __m256i partial[8];
+
+ AddPartial(src, stride, partial);
+
+ const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
+ const __m256i division_table_7 =
+ _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
+
+ Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
+
+ Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
+
+ const __m256i division_table_odd[2] = {
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
+ LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
+
+ CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
+ CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
+
+ uint32_t best_cost = 0;
+ *direction = 0;
+ for (int i = 0; i < 8; ++i) {
+ if (cost[i] > best_cost) {
+ best_cost = cost[i];
+ *direction = i;
+ }
+ }
+ *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
+}
+
+// -------------------------------------------------------------------------
+// CdefFilter
+
+// Load 4 vectors based on the given |direction|.
+inline void LoadDirection(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ // Each |direction| describes a different set of source values. Expand this
+ // set by negating each set. For |direction| == 0 this gives a diagonal line
+ // from top right to bottom left. The first value is y, the second x. Negative
+ // y values move up.
+ // a b c d
+ // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
+ // c
+ // a
+ // 0
+ // b
+ // d
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
+ output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
+ output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
+ output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
+}
+
+// Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
+// do 2 rows at a time.
+void LoadDirection4(const uint16_t* const src, const ptrdiff_t stride,
+ __m128i* output, const int direction) {
+ const int y_0 = kCdefDirections[direction][0][0];
+ const int x_0 = kCdefDirections[direction][0][1];
+ const int y_1 = kCdefDirections[direction][1][0];
+ const int x_1 = kCdefDirections[direction][1][1];
+ output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
+ src - y_0 * stride + stride - x_0);
+ output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
+ src + y_0 * stride + stride + x_0);
+ output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
+ src - y_1 * stride + stride - x_1);
+ output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
+ src + y_1 * stride + stride + x_1);
+}
+
+inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
+ const __m128i& damping, const __m256i& threshold) {
+ const __m256i diff = _mm256_sub_epi16(pixel, reference);
+ const __m256i abs_diff = _mm256_abs_epi16(diff);
+ // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
+ // 0, std::abs(diff))
+ const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
+ // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
+ // [3, 6]. If pixel == kCdefLargeValue (0x4000), shifted_diff will always be
+ // larger than threshold, so the saturating subtraction returns 0 when
+ // pixel == kCdefLargeValue.
+ static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
+ const __m256i thresh_minus_shifted_diff =
+ _mm256_subs_epu16(threshold, shifted_diff);
+ const __m256i clamp_abs_diff =
+ _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
+ // Restore the sign.
+ return _mm256_sign_epi16(clamp_abs_diff, diff);
+}
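Constrain applies the standard CDEF clamp from the comment above lane-wise. A scalar reference:

#include <algorithm>
#include <cstdlib>

inline int ConstrainScalar(int value, int reference, int damping_shift,
                           int threshold) {
  const int diff = value - reference;
  const int abs_diff = std::abs(diff);
  const int clamped =
      std::clamp(threshold - (abs_diff >> damping_shift), 0, abs_diff);
  return (diff < 0) ? -clamped : clamped;  // Restore the sign.
}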
+
+inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
+ const __m256i& tap, const __m128i& damping,
+ const __m256i& threshold) {
+ const __m256i constrained = Constrain(val, pixel, damping, threshold);
+ return _mm256_mullo_epi16(constrained, tap);
+}
+
+template <int width, bool enable_primary = true, bool enable_secondary = true>
+void CdefFilter_AVX2(const uint16_t* src, const ptrdiff_t src_stride,
+ const int height, const int primary_strength,
+ const int secondary_strength, const int damping,
+ const int direction, void* dest,
+ const ptrdiff_t dst_stride) {
+ static_assert(width == 8 || width == 4, "Invalid CDEF width.");
+ static_assert(enable_primary || enable_secondary, "");
+ constexpr bool clipping_required = enable_primary && enable_secondary;
+ auto* dst = static_cast<uint8_t*>(dest);
+ __m128i primary_damping_shift, secondary_damping_shift;
+
+ // FloorLog2() requires input to be > 0.
+ // 8-bit damping range: Y: [3, 6], UV: [2, 5].
+ if (enable_primary) {
+ // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
+ // for UV filtering.
+ primary_damping_shift =
+ _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
+ }
+ if (enable_secondary) {
+ // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
+ // necessary.
+ assert(damping - FloorLog2(secondary_strength) >= 0);
+ secondary_damping_shift =
+ _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
+ }
+ const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
+ const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
+ const __m256i secondary_tap_0 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
+ const __m256i secondary_tap_1 =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
+ const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
+ _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
+ const __m256i primary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
+ const __m256i secondary_threshold =
+ _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));
+
+ int y = height;
+ do {
+ __m128i pixel_128;
+ if (width == 8) {
+ pixel_128 = LoadUnaligned16(src);
+ } else {
+ pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
+ }
+
+ __m256i pixel = SetrM128i(pixel_128, pixel_128);
+
+ __m256i min = pixel;
+ __m256i max = pixel;
+ __m256i sum_pair;
+
+ if (enable_primary) {
+ // Primary |direction|.
+ __m128i primary_val_128[4];
+ if (width == 8) {
+ LoadDirection(src, src_stride, primary_val_128, direction);
+ } else {
+ LoadDirection4(src, src_stride, primary_val_128, direction);
+ }
+
+ __m256i primary_val[2];
+ primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
+ primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, primary_val[0]);
+ min = _mm256_min_epu16(min, primary_val[1]);
+
+ // The source is 16 bits, but only the lower 8 bits matter; the upper
+ // 8 bits carry the "large" flag. After the final primary max has been
+ // calculated, zero out the upper 8 bits and use the result to find the
+ // "16 bit" max.
+ const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
+ max = _mm256_max_epu16(
+ max, _mm256_and_si256(max_p01, cdef_large_value_mask));
+ }
+
+ sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
+ primary_damping_shift, primary_threshold);
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
+ primary_damping_shift, primary_threshold));
+ } else {
+ sum_pair = _mm256_setzero_si256();
+ }
+
+ if (enable_secondary) {
+ // Secondary |direction| values (+/- 2). Clamp |direction|.
+ __m128i secondary_val_128[8];
+ if (width == 8) {
+ LoadDirection(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
+ } else {
+ LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
+ LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
+ }
+
+ __m256i secondary_val[4];
+ secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
+ secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
+ secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
+ secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);
+
+ if (clipping_required) {
+ min = _mm256_min_epu16(min, secondary_val[0]);
+ min = _mm256_min_epu16(min, secondary_val[1]);
+ min = _mm256_min_epu16(min, secondary_val[2]);
+ min = _mm256_min_epu16(min, secondary_val[3]);
+
+ const __m256i max_s01 =
+ _mm256_max_epu8(secondary_val[0], secondary_val[1]);
+ const __m256i max_s23 =
+ _mm256_max_epu8(secondary_val[2], secondary_val[3]);
+ const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
+ max = _mm256_max_epu8(max,
+ _mm256_and_si256(max_s, cdef_large_value_mask));
+ }
+
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
+ secondary_damping_shift, secondary_threshold));
+ sum_pair = _mm256_add_epi16(
+ sum_pair,
+ ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
+ secondary_damping_shift, secondary_threshold));
+ }
+
+ __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
+ _mm256_extracti128_si256(sum_pair, 1));
+
+ // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
+ const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
+ // 8 + sum
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
+ // (... - (sum < 0)) >> 4
+ sum = _mm_add_epi16(sum, sum_lt_0);
+ sum = _mm_srai_epi16(sum, 4);
+ // pixel + ...
+ sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
+ if (clipping_required) {
+ const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
+ _mm256_extracti128_si256(min, 1));
+
+ const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
+ _mm256_extracti128_si256(max, 1));
+ // Clip3
+ sum = _mm_min_epi16(sum, max_128);
+ sum = _mm_max_epi16(sum, min_128);
+ }
+
+ const __m128i result = _mm_packus_epi16(sum, sum);
+ if (width == 8) {
+ src += src_stride;
+ StoreLo8(dst, result);
+ dst += dst_stride;
+ --y;
+ } else {
+ src += src_stride << 1;
+ Store4(dst, result);
+ dst += dst_stride;
+ Store4(dst, _mm_srli_si128(result, 4));
+ dst += dst_stride;
+ y -= 2;
+ }
+ } while (y != 0);
+}
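The write-back restates the rounding from the Clip3 comment above: bias the tap sum by 8, subtract 1 for negative sums so the shift rounds symmetrically about zero, then clamp to the min/max gathered from the taps. In scalar form:

#include <algorithm>

inline int CdefRoundAndClip(int pixel, int sum, int min_val, int max_val) {
  const int adjusted = pixel + ((8 + sum - static_cast<int>(sum < 0)) >> 4);
  return std::clamp(adjusted, min_val, max_val);
}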
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
+ assert(dsp != nullptr);
+ dsp->cdef_direction = CdefDirection_AVX2;
+
+ dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
+ dsp->cdef_filters[0][1] =
+ CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
+ dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
+ dsp->cdef_filters[1][1] =
+ CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
+ dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void CdefInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/libgav1/src/dsp/x86/cdef_avx2.h b/libgav1/src/dsp/x86/cdef_avx2.h
new file mode 100644
index 0000000..41f2d3f
--- /dev/null
+++ b/libgav1/src/dsp/x86/cdef_avx2.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cdef_direction and Dsp::cdef_filters. This function is not
+// thread-safe.
+void CdefInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
+#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
+#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CDEF_AVX2_H_
diff --git a/libgav1/src/dsp/x86/cdef_sse4.cc b/libgav1/src/dsp/x86/cdef_sse4.cc
index 4478bde..6ede778 100644
--- a/libgav1/src/dsp/x86/cdef_sse4.cc
+++ b/libgav1/src/dsp/x86/cdef_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/cdef.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <emmintrin.h>
#include <tmmintrin.h>
@@ -349,8 +349,8 @@
inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
const __m128i division_table[2]) {
// Reverse and clear upper 2 bytes.
- const __m128i reverser =
- _mm_set_epi32(0x80800100, 0x03020504, 0x07060908, 0x0b0a0d0c);
+ const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
+ 0x03020504, 0x07060908, 0x0b0a0d0c);
// 14 13 12 11 10 09 08 ZZ
const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
// 00 14 01 13 02 12 03 11
@@ -371,7 +371,8 @@
const __m128i division_table[2]) {
// Reverse and clear upper 10 bytes.
const __m128i reverser =
- _mm_set_epi32(0x80808080, 0x80808080, 0x80800100, 0x03020504);
+ _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
+ static_cast<int>(0x80800100), 0x03020504);
// 10 09 08 ZZ ZZ ZZ ZZ ZZ
const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
// 00 10 01 09 02 08 03 ZZ
@@ -395,7 +396,7 @@
}
void CdefDirection_SSE4_1(const void* const source, ptrdiff_t stride,
- int* const direction, int* const variance) {
+ uint8_t* const direction, int* const variance) {
assert(direction != nullptr);
assert(variance != nullptr);
const auto* src = static_cast<const uint8_t*>(source);
@@ -414,8 +415,8 @@
cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
const __m128i division_table_odd[2] = {
- LoadUnaligned16(kCdefDivisionTableOddPadded),
- LoadUnaligned16(kCdefDivisionTableOddPadded + 4)};
+ LoadAligned16(kCdefDivisionTableOddPadded),
+ LoadAligned16(kCdefDivisionTableOddPadded + 4)};
cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
@@ -717,7 +718,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -725,4 +726,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/cdef_sse4.h b/libgav1/src/dsp/x86/cdef_sse4.h
index 2593c72..6631eb7 100644
--- a/libgav1/src/dsp/x86/cdef_sse4.h
+++ b/libgav1/src/dsp/x86/cdef_sse4.h
@@ -30,9 +30,16 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_CdefDirection
#define LIBGAV1_Dsp8bpp_CdefDirection LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_CdefFilters
#define LIBGAV1_Dsp8bpp_CdefFilters LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_CDEF_SSE4_H_
diff --git a/libgav1/src/dsp/x86/common_avx2.h b/libgav1/src/dsp/x86/common_avx2.h
new file mode 100644
index 0000000..373116a
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_avx2.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
+
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+
+#include <immintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+namespace libgav1 {
+namespace dsp {
+namespace avx2 {
+
+#include "src/dsp/x86/common_avx2.inc"
+#include "src/dsp/x86/common_sse4.inc"
+
+} // namespace avx2
+
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_avx2.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+
+// common_sse4.inc
+using avx2::Load2;
+using avx2::Load2x2;
+using avx2::Load4;
+using avx2::Load4x2;
+using avx2::LoadAligned16;
+using avx2::LoadAligned16Msan;
+using avx2::LoadHi8;
+using avx2::LoadHi8Msan;
+using avx2::LoadLo8;
+using avx2::LoadLo8Msan;
+using avx2::LoadUnaligned16;
+using avx2::LoadUnaligned16Msan;
+using avx2::MaskHighNBytes;
+using avx2::RightShiftWithRounding_S16;
+using avx2::RightShiftWithRounding_S32;
+using avx2::RightShiftWithRounding_U16;
+using avx2::RightShiftWithRounding_U32;
+using avx2::Store2;
+using avx2::Store4;
+using avx2::StoreAligned16;
+using avx2::StoreHi8;
+using avx2::StoreLo8;
+using avx2::StoreUnaligned16;
+
+// common_avx2.inc
+using avx2::LoadAligned32;
+using avx2::LoadAligned32Msan;
+using avx2::LoadAligned64;
+using avx2::LoadAligned64Msan;
+using avx2::LoadUnaligned32;
+using avx2::LoadUnaligned32Msan;
+using avx2::SetrM128i;
+using avx2::StoreAligned32;
+using avx2::StoreAligned64;
+using avx2::StoreUnaligned32;
+// NOLINTEND
+
+} // namespace dsp
+} // namespace libgav1
+
+#endif // LIBGAV1_TARGETING_AVX2
+#endif // LIBGAV1_SRC_DSP_X86_COMMON_AVX2_H_
diff --git a/libgav1/src/dsp/x86/common_avx2.inc b/libgav1/src/dsp/x86/common_avx2.inc
new file mode 100644
index 0000000..53b4e2e
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_avx2.inc
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Compatibility functions.
+
+inline __m256i SetrM128i(const __m128i lo, const __m128i hi) {
+ // For compatibility with older gcc toolchains (< 8) use
+ // _mm256_inserti128_si256 over _mm256_setr_m128i. Newer gcc versions
+ // implement _mm256_setr_m128i similarly to the following; clang uses a
+ // different method, but no differences in assembly have been observed.
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
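
A minimal usage sketch (values are arbitrary, not from this change): the low 128-bit lane receives the first argument and the high lane the second.

  const __m128i lo = _mm_set1_epi16(1);
  const __m128i hi = _mm_set1_epi16(2);
  const __m256i both = SetrM128i(lo, hi);  // lane 0: all 1s, lane 1: all 2s.
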
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m256i LoadAligned32(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ return _mm256_load_si256(static_cast<const __m256i*>(a));
+}
+
+inline void LoadAligned64(const void* a, __m256i dst[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ dst[0] = _mm256_load_si256(static_cast<const __m256i*>(a) + 0);
+ dst[1] = _mm256_load_si256(static_cast<const __m256i*>(a) + 1);
+}
+
+inline __m256i LoadUnaligned32(const void* a) {
+ return _mm256_loadu_si256(static_cast<const __m256i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m256i MaskOverreads(const __m256i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes >= 32) return _mm256_setzero_si256();
+ if (over_read_in_bytes > 0) {
+ __m128i m = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes % 16; ++i) {
+ m = _mm_srli_si128(m, 1);
+ }
+ const __m256i mask = (over_read_in_bytes < 16)
+ ? SetrM128i(_mm_set1_epi8(-1), m)
+ : SetrM128i(m, _mm_setzero_si128());
+ dst = _mm256_and_si256(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m256i LoadAligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+}
+
+inline void LoadAligned64Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i dst[2]) {
+ dst[0] = MaskOverreads(LoadAligned32(source), over_read_in_bytes);
+ dst[1] = MaskOverreads(LoadAligned32(static_cast<const __m256i*>(source) + 1),
+ over_read_in_bytes);
+}
+
+inline __m256i LoadUnaligned32Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned32(source), over_read_in_bytes);
+}
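
A typical call site knows how far a fixed 32-byte load reaches past the initialized portion of a row; under MSan the excess bytes are zeroed before they can feed a use-of-uninitialized-value report. A sketch, with |src| and the 24-byte valid width assumed for illustration:

  // 32 bytes are loaded but only 24 are initialized; zero the top 8 so
  // MSan does not flag downstream arithmetic on the masked lanes.
  const __m256i row = LoadUnaligned32Msan(src, /*over_read_in_bytes=*/8);
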
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void StoreAligned32(void* a, const __m256i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a), v);
+}
+
+inline void StoreAligned64(void* a, const __m256i v[2]) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0x1f) == 0);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 0, v[0]);
+ _mm256_store_si256(static_cast<__m256i*>(a) + 1, v[1]);
+}
+
+inline void StoreUnaligned32(void* a, const __m256i v) {
+ _mm256_storeu_si256(static_cast<__m256i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m256i RightShiftWithRounding_S16(const __m256i v_val_d, int bits) {
+ assert(bits <= 16);
+ const __m256i v_bias_d =
+ _mm256_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m256i v_tmp_d = _mm256_add_epi16(v_val_d, v_bias_d);
+ return _mm256_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m256i RightShiftWithRounding_S32(const __m256i v_val_d, int bits) {
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i v_tmp_d = _mm256_add_epi32(v_val_d, v_bias_d);
+ return _mm256_srai_epi32(v_tmp_d, bits);
+}
diff --git a/libgav1/src/dsp/x86/common_sse4.h b/libgav1/src/dsp/x86/common_sse4.h
index 24c801f..41a3a68 100644
--- a/libgav1/src/dsp/x86/common_sse4.h
+++ b/libgav1/src/dsp/x86/common_sse4.h
@@ -20,14 +20,14 @@
#include "src/utils/compiler_attributes.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <emmintrin.h>
#include <smmintrin.h>
#include <cassert>
+#include <cstddef>
#include <cstdint>
-#include <cstdlib>
#include <cstring>
#if 0
@@ -70,189 +70,61 @@
#define PR(var, N) PrintReg(var, #var, N)
#define PD(var) PrintReg(var, #var);
#define PX(var) PrintRegX(var, #var);
+
+#if LIBGAV1_MSAN
+#include <sanitizer/msan_interface.h>
+
+inline void PrintShadow(const void* r, const char* const name,
+ const size_t size) {
+ fprintf(stderr, "Shadow for %s:\n", name);
+ __msan_print_shadow(r, size);
+}
+#define PS(var, N) PrintShadow(var, #var, N)
+
+#endif // LIBGAV1_MSAN
+
#endif // 0
namespace libgav1 {
namespace dsp {
+namespace sse4 {
-//------------------------------------------------------------------------------
-// Load functions.
+#include "src/dsp/x86/common_sse4.inc"
-inline __m128i Load2(const void* src) {
- int16_t val;
- memcpy(&val, src, sizeof(val));
- return _mm_cvtsi32_si128(val);
-}
+} // namespace sse4
-inline __m128i Load2x2(const void* src1, const void* src2) {
- uint16_t val1;
- uint16_t val2;
- memcpy(&val1, src1, sizeof(val1));
- memcpy(&val2, src2, sizeof(val2));
- return _mm_cvtsi32_si128(val1 | (val2 << 16));
-}
-
-// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
-template <int lane>
-inline __m128i Load2(const void* const buf, __m128i val) {
- uint16_t temp;
- memcpy(&temp, buf, 2);
- return _mm_insert_epi16(val, temp, lane);
-}
-
-inline __m128i Load4(const void* src) {
- // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
- // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
- // movss instruction.
- //
- // Until compiler support of _mm_loadu_si32 is widespread, use of
- // _mm_loadu_si32 is banned.
- int val;
- memcpy(&val, src, sizeof(val));
- return _mm_cvtsi32_si128(val);
-}
-
-inline __m128i Load4x2(const void* src1, const void* src2) {
- // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
- // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
- // movss instruction.
- //
- // Until compiler support of _mm_loadu_si32 is widespread, use of
- // _mm_loadu_si32 is banned.
- int val1, val2;
- memcpy(&val1, src1, sizeof(val1));
- memcpy(&val2, src2, sizeof(val2));
- return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
-}
-
-inline __m128i LoadLo8(const void* a) {
- return _mm_loadl_epi64(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadHi8(const __m128i v, const void* a) {
- const __m128 x =
- _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
- return _mm_castps_si128(x);
-}
-
-inline __m128i LoadUnaligned16(const void* a) {
- return _mm_loadu_si128(static_cast<const __m128i*>(a));
-}
-
-inline __m128i LoadAligned16(const void* a) {
- assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
- return _mm_load_si128(static_cast<const __m128i*>(a));
-}
-
-//------------------------------------------------------------------------------
-// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
-
-inline __m128i MaskOverreads(const __m128i source,
- const int over_read_in_bytes) {
- __m128i dst = source;
-#if LIBGAV1_MSAN
- if (over_read_in_bytes > 0) {
- __m128i mask = _mm_set1_epi8(-1);
- for (int i = 0; i < over_read_in_bytes; ++i) {
- mask = _mm_srli_si128(mask, 1);
- }
- dst = _mm_and_si128(dst, mask);
- }
-#else
- static_cast<void>(over_read_in_bytes);
-#endif
- return dst;
-}
-
-inline __m128i LoadLo8Msan(const void* const source,
- const int over_read_in_bytes) {
- return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
-}
-
-inline __m128i LoadAligned16Msan(const void* const source,
- const int over_read_in_bytes) {
- return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
-}
-
-inline __m128i LoadUnaligned16Msan(const void* const source,
- const int over_read_in_bytes) {
- return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
-}
-
-//------------------------------------------------------------------------------
-// Store functions.
-
-inline void Store2(void* dst, const __m128i x) {
- const int val = _mm_cvtsi128_si32(x);
- memcpy(dst, &val, 2);
-}
-
-inline void Store4(void* dst, const __m128i x) {
- const int val = _mm_cvtsi128_si32(x);
- memcpy(dst, &val, sizeof(val));
-}
-
-inline void StoreLo8(void* a, const __m128i v) {
- _mm_storel_epi64(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreHi8(void* a, const __m128i v) {
- _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
-}
-
-inline void StoreAligned16(void* a, const __m128i v) {
- _mm_store_si128(static_cast<__m128i*>(a), v);
-}
-
-inline void StoreUnaligned16(void* a, const __m128i v) {
- _mm_storeu_si128(static_cast<__m128i*>(a), v);
-}
-
-//------------------------------------------------------------------------------
-// Arithmetic utilities.
-
-inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
- assert(bits <= 16);
- const __m128i v_bias_d =
- _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
- const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
- return _mm_srli_epi16(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
- assert(bits <= 16);
- const __m128i v_bias_d =
- _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
- const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
- return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-//------------------------------------------------------------------------------
-// Masking utilities
-inline __m128i MaskHighNBytes(int n) {
- static constexpr uint8_t kMask[32] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
- 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
- };
-
- return LoadUnaligned16(kMask + n);
-}
+// NOLINTBEGIN(misc-unused-using-decls)
+// These function aliases shall not be visible to external code. They are
+// restricted to x86/*_sse4.cc files only. This scheme exists to distinguish two
+// possible implementations of common functions, which may differ based on
+// whether the compiler is permitted to use avx2 instructions.
+using sse4::Load2;
+using sse4::Load2x2;
+using sse4::Load4;
+using sse4::Load4x2;
+using sse4::LoadAligned16;
+using sse4::LoadAligned16Msan;
+using sse4::LoadHi8;
+using sse4::LoadHi8Msan;
+using sse4::LoadLo8;
+using sse4::LoadLo8Msan;
+using sse4::LoadUnaligned16;
+using sse4::LoadUnaligned16Msan;
+using sse4::MaskHighNBytes;
+using sse4::RightShiftWithRounding_S16;
+using sse4::RightShiftWithRounding_S32;
+using sse4::RightShiftWithRounding_U16;
+using sse4::RightShiftWithRounding_U32;
+using sse4::Store2;
+using sse4::Store4;
+using sse4::StoreAligned16;
+using sse4::StoreHi8;
+using sse4::StoreLo8;
+using sse4::StoreUnaligned16;
+// NOLINTEND
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_COMMON_SSE4_H_
diff --git a/libgav1/src/dsp/x86/common_sse4.inc b/libgav1/src/dsp/x86/common_sse4.inc
new file mode 100644
index 0000000..35c56b8
--- /dev/null
+++ b/libgav1/src/dsp/x86/common_sse4.inc
@@ -0,0 +1,206 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//------------------------------------------------------------------------------
+// Load functions.
+
+inline __m128i Load2(const void* src) {
+ int16_t val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load2x2(const void* src1, const void* src2) {
+ uint16_t val1;
+ uint16_t val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_cvtsi32_si128(val1 | (val2 << 16));
+}
+
+// Load 2 uint8_t values into |lane| * 2 and |lane| * 2 + 1.
+template <int lane>
+inline __m128i Load2(const void* const buf, __m128i val) {
+ int16_t temp;
+ memcpy(&temp, buf, 2);
+ return _mm_insert_epi16(val, temp, lane);
+}
+
+inline __m128i Load4(const void* src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+inline __m128i Load4x2(const void* src1, const void* src2) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val1, val2;
+ memcpy(&val1, src1, sizeof(val1));
+ memcpy(&val2, src2, sizeof(val2));
+ return _mm_insert_epi32(_mm_cvtsi32_si128(val1), val2, 1);
+}
+
+inline __m128i LoadLo8(const void* a) {
+ return _mm_loadl_epi64(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadHi8(const __m128i v, const void* a) {
+ const __m128 x =
+ _mm_loadh_pi(_mm_castsi128_ps(v), static_cast<const __m64*>(a));
+ return _mm_castps_si128(x);
+}
+
+inline __m128i LoadUnaligned16(const void* a) {
+ return _mm_loadu_si128(static_cast<const __m128i*>(a));
+}
+
+inline __m128i LoadAligned16(const void* a) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ return _mm_load_si128(static_cast<const __m128i*>(a));
+}
+
+//------------------------------------------------------------------------------
+// Load functions to avoid MemorySanitizer's use-of-uninitialized-value warning.
+
+inline __m128i MaskOverreads(const __m128i source,
+ const ptrdiff_t over_read_in_bytes) {
+ __m128i dst = source;
+#if LIBGAV1_MSAN
+ if (over_read_in_bytes > 0) {
+ __m128i mask = _mm_set1_epi8(-1);
+ for (ptrdiff_t i = 0; i < over_read_in_bytes; ++i) {
+ mask = _mm_srli_si128(mask, 1);
+ }
+ dst = _mm_and_si128(dst, mask);
+ }
+#else
+ static_cast<void>(over_read_in_bytes);
+#endif
+ return dst;
+}
+
+inline __m128i LoadLo8Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadLo8(source), over_read_in_bytes + 8);
+}
+
+inline __m128i LoadHi8Msan(const __m128i v, const void* source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadHi8(v, source), over_read_in_bytes);
+}
+
+inline __m128i LoadAligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadAligned16(source), over_read_in_bytes);
+}
+
+inline __m128i LoadUnaligned16Msan(const void* const source,
+ const ptrdiff_t over_read_in_bytes) {
+ return MaskOverreads(LoadUnaligned16(source), over_read_in_bytes);
+}
+
+//------------------------------------------------------------------------------
+// Store functions.
+
+inline void Store2(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, 2);
+}
+
+inline void Store4(void* dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+inline void StoreLo8(void* a, const __m128i v) {
+ _mm_storel_epi64(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreHi8(void* a, const __m128i v) {
+ _mm_storeh_pi(static_cast<__m64*>(a), _mm_castsi128_ps(v));
+}
+
+inline void StoreAligned16(void* a, const __m128i v) {
+ assert((reinterpret_cast<uintptr_t>(a) & 0xf) == 0);
+ _mm_store_si128(static_cast<__m128i*>(a), v);
+}
+
+inline void StoreUnaligned16(void* a, const __m128i v) {
+ _mm_storeu_si128(static_cast<__m128i*>(a), v);
+}
+
+//------------------------------------------------------------------------------
+// Arithmetic utilities.
+
+inline __m128i RightShiftWithRounding_U16(const __m128i v_val_d, int bits) {
+ assert(bits <= 16);
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, _mm_setzero_si128());
+}
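
Worked example of the avg trick: for v = 19 and bits = 2, the shift by bits - 1 gives 19 >> 1 = 9, and _mm_avg_epu16 computes (9 + 0 + 1) >> 1 = 5, matching the direct rounded shift (19 + 2) >> 2 = 5. Because _mm_avg_epu16 is unsigned and cannot overflow, this stays correct even when v plus the rounding bias would not fit in 16 bits.
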
+
+inline __m128i RightShiftWithRounding_S16(const __m128i v_val_d, int bits) {
+ assert(bits < 16);
+ const __m128i v_bias_d =
+ _mm_set1_epi16(static_cast<int16_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_U32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+inline __m128i RightShiftWithRounding_S32(const __m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// Use this when |bits| is not an immediate value.
+inline __m128i VariableRightShiftWithRounding_S32(const __m128i v_val_d,
+ int bits) {
+ const __m128i v_bias_d =
+ _mm_set1_epi32(static_cast<int32_t>((1 << bits) >> 1));
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_sra_epi32(v_tmp_d, _mm_cvtsi32_si128(bits));
+}
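
The difference from RightShiftWithRounding_S32 above is the final shift: _mm_sra_epi32 takes its count in a register, so this variant works when |bits| is only known at runtime, as the comment notes. A usage sketch with illustrative names:

  // |shift| arrives at runtime, e.g. derived from the bitdepth.
  const __m128i rounded = VariableRightShiftWithRounding_S32(sum, shift);
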
+
+//------------------------------------------------------------------------------
+// Masking utilities
+inline __m128i MaskHighNBytes(int n) {
+ static constexpr uint8_t kMask[32] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ };
+
+ return LoadUnaligned16(kMask + n);
+}
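
Worked example: MaskHighNBytes(3) loads kMask[3..18], yielding thirteen 0x00 bytes followed by three 0xFF bytes, i.e. a mask that selects only the high 3 bytes of a 128-bit register.
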
diff --git a/libgav1/src/dsp/x86/convolve_avx2.cc b/libgav1/src/dsp/x86/convolve_avx2.cc
new file mode 100644
index 0000000..2ecb77c
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_avx2.cc
@@ -0,0 +1,1544 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/convolve.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+#include "src/dsp/x86/convolve_sse4.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1, which prevents the
+// final sum from overflowing int16_t.
+template <int filter_index>
+__m256i SumOnePassTaps(const __m256i* const src, const __m256i* const taps) {
+ __m256i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m256i v_madd_21 = _mm256_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m256i v_madd_43 = _mm256_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m256i v_madd_65 = _mm256_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm256_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm256_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m256i v_madd_10 = _mm256_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m256i v_madd_76 = _mm256_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m256i v_sum_3210 = _mm256_add_epi16(v_madd_10, v_madd_32);
+ const __m256i v_sum_7654 = _mm256_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm256_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm256_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m256i v_madd_32 = _mm256_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m256i v_madd_54 = _mm256_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm256_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
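
The pre-shift matters because _mm256_maddubs_epi16 multiplies unsigned 8-bit sources by signed 8-bit taps and saturates each pair sum to int16_t, while the _mm256_add_epi16 accumulation wraps; halving the taps keeps every partial and final sum within +/-32767. The dropped bit is compensated downstream by shifting one bit less (kFilterBits - 1 instead of kFilterBits).
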
+
+template <int filter_index>
+__m256i SumHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i v_src[4];
+ const __m256i src_long = *src;
+ const __m256i src_long_dup_lo = _mm256_unpacklo_epi8(src_long, src_long);
+ const __m256i src_long_dup_hi = _mm256_unpackhi_epi8(src_long, src_long);
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[2] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ v_src[3] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
+ v_src[1] = _mm256_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index>
+__m256i SimpleHorizontalTaps(const __m256i* const src,
+ const __m256i* const v_tap) {
+ __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm256_add_epi16(sum, _mm256_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm256_packus_epi16(sum, sum);
+}
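
Worked example of the combined rounding, using the 8-bit constants kInterRoundBitsHorizontal = 3 and kFilterBits = 7 with sum = 100: the two-stage form computes (100 + 2) >> 2 = 25, then (25 + 8) >> 4 = 2; the fused form adds first_shift_rounding_bit = 2 up front and computes (100 + 2 + 32) >> 6 = 2, the same result without the intermediate shift.
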
+
+template <int filter_index>
+__m256i HorizontalTaps8To16(const __m256i* const src,
+ const __m256i* const v_tap) {
+ const __m256i sum = SumHorizontalTaps<filter_index>(src, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+// Filter 2xh sizes.
+template <int num_taps, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ // Horizontal passes only need to account for |num_taps| 2 and 4 when
+ // |width| <= 4.
+ assert(num_taps <= 4);
+ if (num_taps <= 4) {
+ if (!is_compound) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ if (is_2d) {
+ const __m128i sum =
+ HorizontalTaps8To16_2x2<filter_index>(src, src_stride, v_tap);
+ Store4(&dest16[0], sum);
+ dest16 += pred_stride;
+ Store4(&dest16[0], _mm_srli_si128(sum, 8));
+ dest16 += pred_stride;
+ } else {
+ const __m128i sum =
+ SimpleHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+ Store2(dest8, sum);
+ dest8 += pred_stride;
+ Store2(dest8, _mm_srli_si128(sum, 4));
+ dest8 += pred_stride;
+ }
+
+ src += src_stride << 1;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| because the horizontal pass
+ // generates context for the vertical pass.
+ if (is_2d) {
+ assert(height % 2 == 1);
+ __m128i sum;
+ const __m128i input = LoadLo8(&src[2]);
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_43 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
+ sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ } else {
+ // 02 03 03 04 04 05 05 06 06 07 ....
+ const __m128i v_src_32 =
+ _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
+ // 04 05 05 06 06 07 07 08 ...
+ const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
+ const __m128i v_madd_32 =
+ _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 =
+ _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_54, v_madd_32);
+ }
+ sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+ Store4(dest16, sum);
+ }
+ }
+ }
+}
+
+// Filter widths >= 4.
+template <int num_taps, int filter_index, bool is_2d = false,
+ bool is_compound = false>
+void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dest, const ptrdiff_t pred_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ auto* dest8 = static_cast<uint8_t*>(dest);
+ auto* dest16 = static_cast<uint16_t*>(dest);
+
+ if (width >= 32) {
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ if (is_2d || is_compound) {
+ // Load into 2 128 bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[x]), LoadUnaligned16(&src[x + 8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(LoadUnaligned16(&src[x + 16]),
+ LoadUnaligned16(&src[x + 24]));
+ const __m256i result2 =
+ HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[x], result);
+ StoreAligned32(&dest16[x + 16], result2);
+ } else {
+ StoreUnaligned32(&dest16[x], result);
+ StoreUnaligned32(&dest16[x + 16], result2);
+ }
+ } else {
+ // Load src used to calculate dest8[7:0] and dest8[23:16].
+ const __m256i src_long = LoadUnaligned32(&src[x]);
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ // Load src used to calculate dest8[15:8] and dest8[31:24].
+ const __m256i src_long2 = LoadUnaligned32(&src[x + 8]);
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ // Combine results and store.
+ StoreUnaligned32(&dest8[x], _mm256_unpacklo_epi64(result, result2));
+ }
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ dest8 += pred_stride;
+ dest16 += pred_stride;
+ } while (--y != 0);
+ } else if (width == 16) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ if (is_2d || is_compound) {
+ // Load into 2 128 bit lanes.
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 =
+ SetrM128i(LoadUnaligned16(&src[src_stride]),
+ LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ HorizontalTaps8To16<filter_index>(&src_long2, v_tap);
+ if (is_2d) {
+ StoreAligned32(&dest16[0], result);
+ StoreAligned32(&dest16[pred_stride], result2);
+ } else {
+ StoreUnaligned32(&dest16[0], result);
+ StoreUnaligned32(&dest16[pred_stride], result2);
+ }
+ } else {
+ // Load into 2 128 bit lanes.
+ const __m256i src_long = SetrM128i(LoadUnaligned16(&src[0]),
+ LoadUnaligned16(&src[src_stride]));
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ const __m256i src_long2 = SetrM128i(
+ LoadUnaligned16(&src[8]), LoadUnaligned16(&src[8 + src_stride]));
+ const __m256i result2 =
+ SimpleHorizontalTaps<filter_index>(&src_long2, v_tap);
+ const __m256i packed_result = _mm256_unpacklo_epi64(result, result2);
+ StoreUnaligned16(&dest8[0], _mm256_castsi256_si128(packed_result));
+ StoreUnaligned16(&dest8[pred_stride],
+ _mm256_extracti128_si256(packed_result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long =
+ SetrM128i(LoadUnaligned16(&src[0]), LoadUnaligned16(&src[8]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreAligned32(&dest16[0], result);
+ }
+
+ } else if (width == 8) {
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ // Load into 2 128 bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ if (is_2d || is_compound) {
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ if (is_2d) {
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreAligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ } else {
+ StoreUnaligned16(&dest16[0], _mm256_castsi256_si128(result));
+ StoreUnaligned16(&dest16[pred_stride],
+ _mm256_extracti128_si256(result, 1));
+ }
+ } else {
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest8[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreAligned16(&dest16[0], _mm256_castsi256_si128(result));
+ }
+
+ } else { // width == 4
+ int y = height;
+ if (is_2d) y -= 1;
+ do {
+ // Load into 2 128 bit lanes.
+ const __m128i this_row = LoadUnaligned16(&src[0]);
+ const __m128i next_row = LoadUnaligned16(&src[src_stride]);
+ const __m256i src_long = SetrM128i(this_row, next_row);
+ if (is_2d || is_compound) {
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ StoreLo8(&dest16[pred_stride], _mm256_extracti128_si256(result, 1));
+ } else {
+ const __m256i result =
+ SimpleHorizontalTaps<filter_index>(&src_long, v_tap);
+ Store4(&dest8[0], _mm256_castsi256_si128(result));
+ Store4(&dest8[pred_stride], _mm256_extracti128_si256(result, 1));
+ }
+ src += src_stride * 2;
+ dest8 += pred_stride * 2;
+ dest16 += pred_stride * 2;
+ y -= 2;
+ } while (y != 0);
+
+ // The 2d filters have an odd |height| during the horizontal pass, so
+ // filter the remaining row.
+ if (is_2d) {
+ const __m256i src_long = _mm256_castsi128_si256(LoadUnaligned16(&src[0]));
+ const __m256i result =
+ HorizontalTaps8To16<filter_index>(&src_long, v_tap);
+ StoreLo8(&dest16[0], _mm256_castsi256_si128(result));
+ }
+ }
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m256i* v_tap) {
+ if (num_taps == 8) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
+ v_tap[3] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 12)); // k7k6
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(*filter); // k1k0
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ v_tap[3] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 6)); // k7k6
+ }
+ } else if (num_taps == 6) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 2)); // k2k1
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
+ v_tap[2] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 10)); // k6k5
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 1)); // k2k1
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ v_tap[2] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 5)); // k6k5
+ }
+ } else if (num_taps == 4) {
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 4)); // k3k2
+ v_tap[1] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 8)); // k5k4
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 2)); // k3k2
+ v_tap[1] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 4)); // k5k4
+ }
+ } else { // num_taps == 2
+ if (is_2d_vertical) {
+ v_tap[0] = _mm256_broadcastd_epi32(_mm_srli_si128(*filter, 6)); // k4k3
+ } else {
+ v_tap[0] = _mm256_broadcastw_epi16(_mm_srli_si128(*filter, 3)); // k4k3
+ }
+ }
+}
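
The broadcasts pair adjacent taps so that _mm256_maddubs_epi16 sees the same (k, k+1) tap pair in every 16-bit lane. For example, with num_taps == 4 the horizontal path broadcasts bytes 2..3 of |*filter| (k2, k3); against source bytes interleaved as (s_i, s_i+1) per lane, each lane then computes s_i * k2 + s_i+1 * k3.
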
+
+template <int num_taps, bool is_compound>
+__m256i SimpleSum2DVerticalTaps(const __m256i* const src,
+ const __m256i* const taps) {
+ __m256i sum_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m256i sum_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m256i madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m256i madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo =
+ _mm256_madd_epi16(_mm256_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi =
+ _mm256_madd_epi16(_mm256_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm256_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm256_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm256_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical16xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m256i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m256i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned32(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ const __m128i packed_sum = _mm_packus_epi16(
+ _mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
+ StoreUnaligned16(dst8_x, packed_sum);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 16;
+ } while (x < width);
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass2xH(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m128i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+template <bool is_2d = false, bool is_compound = false>
+LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
+ const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
+ const ptrdiff_t dst_stride, const int width, const int height,
+ const int filter_id, const int filter_index) {
+ assert(filter_id != 0);
+ __m256i v_tap[4];
+ const __m128i v_horizontal_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+
+ if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 1) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 0) { // 6 tap.
+ SetupTaps<6>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else if (filter_index == 5) { // 4 tap.
+ SetupTaps<4>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ } else { // 2 tap.
+ SetupTaps<2>(&v_horizontal_filter, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
+ }
+}
+
+void Convolve2D_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ if (width > 2) {
+ DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result,
+ width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ } else {
+ // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH</*is_2d=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+ }
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2>(intermediate_result, dest, dest_stride, width,
+ height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<8>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<6>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<4>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 2) {
+ Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else if (width == 4) {
+ Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
+ taps);
+ } else {
+ Filter2DVertical<2>(intermediate_result, dest, dest_stride, width,
+ height, taps);
+ }
+ }
+ }
+}
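
Sizing check for the intermediate buffer: |intermediate_height| = height + vertical_taps - 1, so a 128x128 block (kMaxSuperBlockSizeInPixels) with an 8-tap vertical filter needs 128 + 8 - 1 = 135 rows of uint16_t, exactly the kMaxSuperBlockSizeInPixels * (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1) worst case the array is declared with.
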
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m256i Compound1DShift(const __m256i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index, bool unpack_high = false>
+__m256i SumVerticalTaps(const __m256i* const srcs, const __m256i* const v_tap) {
+ __m256i v_src[4];
+
+ if (!unpack_high) {
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ } else {
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm256_unpackhi_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm256_unpackhi_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm256_unpackhi_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm256_unpackhi_epi8(srcs[2], srcs[3]);
+ }
+ }
+ return SumOnePassTaps<filter_index>(v_src, v_tap);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical32xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int width, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+ assert(width >= 32);
+ int x = 0;
+ do {
+ const uint8_t* src_x = src + x;
+ __m256i srcs[8];
+ srcs[0] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadUnaligned32(src_x);
+ src_x += src_stride;
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+ StoreUnaligned32(dst16_x, results);
+ StoreUnaligned32(dst16_x + 16, results_hi);
+ dst16_x += dst_stride;
+ } else {
+ const __m256i results =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+
+ StoreUnaligned32(dst8_x, packed_results);
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 32;
+ } while (x < width);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical16xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the source data for the next row.
+ srcs[0] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadUnaligned16(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m256i sums_hi =
+ SumVerticalTaps<filter_index, /*unpack_high=*/true>(srcs, v_tap);
+ if (is_compound) {
+ const __m256i results =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x20));
+ const __m256i results_hi =
+ Compound1DShift(_mm256_permute2x128_si256(sums, sums_hi, 0x31));
+
+ StoreUnaligned32(dst16, results);
+ StoreUnaligned32(dst16 + dst_stride, results_hi);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i results_hi =
+ RightShiftWithRounding_S16(sums_hi, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results_hi);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+ const auto next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreUnaligned16(dst8, this_dst);
+ StoreUnaligned16(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m256i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m256i srcs[8 + 1];
+ // The upper 128 bits hold the source data for the next row.
+ srcs[0] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[0] =
+ _mm256_inserti128_si256(srcs[0], _mm256_castsi256_si128(srcs[1]), 1);
+ srcs[2] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[1] =
+ _mm256_inserti128_si256(srcs[1], _mm256_castsi256_si128(srcs[2]), 1);
+ if (num_taps >= 6) {
+ srcs[3] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[2] =
+ _mm256_inserti128_si256(srcs[2], _mm256_castsi256_si128(srcs[3]), 1);
+ srcs[4] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[3] =
+ _mm256_inserti128_si256(srcs[3], _mm256_castsi256_si128(srcs[4]), 1);
+ if (num_taps == 8) {
+ srcs[5] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[4] = _mm256_inserti128_si256(srcs[4],
+ _mm256_castsi256_si128(srcs[5]), 1);
+ srcs[6] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+ srcs[5] = _mm256_inserti128_si256(srcs[5],
+ _mm256_castsi256_si128(srcs[6]), 1);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row - 1] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 2] = _mm256_inserti128_si256(
+ srcs[next_row - 2], _mm256_castsi256_si128(srcs[next_row - 1]), 1);
+
+ srcs[next_row] = _mm256_castsi128_si256(LoadLo8(src_x));
+ src_x += src_stride;
+
+ srcs[next_row - 1] = _mm256_inserti128_si256(
+ srcs[next_row - 1], _mm256_castsi256_si128(srcs[next_row]), 1);
+
+ const __m256i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
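+      // The 1D compound shift is always kInterRoundBitsHorizontal - 1, even
+      // for the vertical pass.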
+ const __m256i results = Compound1DShift(sums);
+ const __m128i this_dst = _mm256_castsi256_si128(results);
+      const __m128i next_dst = _mm256_extracti128_si256(results, 1);
+
+ StoreUnaligned16(dst16, this_dst);
+ StoreUnaligned16(dst16 + dst_stride, next_dst);
+ dst16 += dst_stride << 1;
+ } else {
+ const __m256i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m256i packed_results = _mm256_packus_epi16(results, results);
+ const __m128i this_dst = _mm256_castsi256_si128(packed_results);
+      const __m128i next_dst = _mm256_extracti128_si256(packed_results, 1);
+
+ StoreLo8(dst8, this_dst);
+ StoreLo8(dst8 + dst_stride, next_dst);
+ dst8 += dst_stride << 1;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
+
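+// Overload taking 128-bit taps: filters a single 8-pixel row per iteration
+// instead of pairing rows across 256-bit lanes.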
+template <int filter_index, bool is_compound = false>
+void FilterVertical8xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int /*width*/, const int height,
+ const __m128i* const v_tap) {
+ const int num_taps = GetNumTapsInFilter(filter_index);
+ const int next_row = num_taps - 1;
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ const uint8_t* src_x = src;
+ __m128i srcs[8];
+ srcs[0] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadLo8(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadLo8(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadLo8(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadLo8(src_x);
+ src_x += src_stride;
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += dst_stride;
+ } else {
+ const __m128i results = RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ StoreLo8(dst8, _mm_packus_epi16(results, results));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+}
+
+void ConvolveVertical_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int vertical_filter_index,
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
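+  // The half filter values are pre-shifted by 1, which keeps the per-pixel
+  // sums within int16_t range.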
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<0>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<2>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<3>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<4>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+    } else { // 4 tap (filter_index == 5).
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ } else {
+ FilterVertical32xH<5>(src, src_stride, dest, dest_stride, width, height,
+ taps_256);
+ }
+ }
+  } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+    } else { // 4 tap (filter_index == 5).
+ SetupTaps<4>(&v_filter, taps);
+ if (width == 2) {
+ FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height,
+ taps);
+ } else {
+ FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height,
+ taps);
+ }
+ }
+ }
+}
+
+void ConvolveCompoundVertical_AVX2(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int vertical_filter_index,
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
+ const int filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(filter_index);
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride;
+ auto* dest = static_cast<uint8_t*>(prediction);
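+  // Compound output is 16-bit intermediate data, packed at a stride of
+  // |width| for later blending with a second prediction.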
+ const ptrdiff_t dest_stride = width;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 4.
+ if (width > 4) {
+ __m256i taps_256[4];
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<0, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<2, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<3, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<4, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+    } else { // 4 tap (filter_index == 5).
+ SetupTaps<4>(&v_filter, taps_256);
+ if (width == 8) {
+ FilterVertical8xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else if (width == 16) {
+ FilterVertical16xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ } else {
+ FilterVertical32xH<5, /*is_compound=*/true>(
+ src, src_stride, dest, dest_stride, width, height, taps_256);
+ }
+ }
+ } else { // width <= 4
+ // Use 128 bit code.
+ __m128i taps[4];
+
+ if (filter_index < 2) { // 6 tap.
+ SetupTaps<6>(&v_filter, taps);
+ FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 2) { // 8 tap.
+ SetupTaps<8>(&v_filter, taps);
+ FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 3) { // 2 tap.
+ SetupTaps<2>(&v_filter, taps);
+ FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ } else if (filter_index == 4) { // 4 tap.
+ SetupTaps<4>(&v_filter, taps);
+ FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+    } else { // 4 tap (filter_index == 5).
+ SetupTaps<4>(&v_filter, taps);
+ FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest,
+ dest_stride, height, taps);
+ }
+ }
+}
+
+void ConvolveHorizontal_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int /*vertical_filter_index*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
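+  // The offset is 3 because the widest (8 tap) filter spans taps -3..+4
+  // around the reference pixel.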
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width > 2) {
+ DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ } else {
+    // Use the non-AVX2 version for smaller widths.
+ DoHorizontalPass2xH(src, reference_stride, dest, pred_stride, width, height,
+ horizontal_filter_id, filter_index);
+ }
+}
+
+void ConvolveCompoundHorizontal_AVX2(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int horizontal_filter_index, const int /*vertical_filter_index*/,
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int filter_index = GetFilterIndex(horizontal_filter_index, width);
+ // Set |src| to the outermost tap.
+ const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
+ auto* dest = static_cast<uint8_t*>(prediction);
+ // All compound functions output to the predictor buffer with |pred_stride|
+ // equal to |width|.
+ assert(pred_stride == width);
+ // Compound functions start at 4x4.
+ assert(width >= 4 && height >= 4);
+
+#ifdef NDEBUG
+  // Quiet the unused variable warning when the assert is compiled out.
+ (void)pred_stride;
+#endif
+
+ DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
+ filter_index);
+}
+
+void ConvolveCompound2D_AVX2(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
+ const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
+ const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
+ const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
+
+ // The output of the horizontal filter is guaranteed to fit in 16 bits.
+ alignas(32) uint16_t
+ intermediate_result[kMaxSuperBlockSizeInPixels *
+ (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
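+  // The horizontal pass generates |vertical_taps| - 1 extra rows of context
+  // for the vertical pass.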
+ const int intermediate_height = height + vertical_taps - 1;
+
+ const ptrdiff_t src_stride = reference_stride;
+ const auto* src = static_cast<const uint8_t*>(reference) -
+ (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
+ DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
+ src, src_stride, intermediate_result, width, width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
+
+ // Vertical filter.
+ auto* dest = static_cast<uint8_t*>(prediction);
+ const ptrdiff_t dest_stride = pred_stride;
+ assert(vertical_filter_id != 0);
+
+ const __m128i v_filter =
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
+
+ // Use 256 bits for width > 8.
+ if (width > 8) {
+ __m256i taps_256[4];
+ const __m128i v_filter_ext = _mm_cvtepi8_epi16(v_filter);
+
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter_ext, taps_256);
+ Filter2DVertical16xH<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps_256);
+ }
+ } else { // width <= 8
+ __m128i taps[4];
+ // Use 128 bit code.
+ if (vertical_taps == 8) {
+ SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<8, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 6) {
+ SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<6, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else if (vertical_taps == 4) {
+ SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<4, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ } else { // |vertical_taps| == 2
+ SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
+ if (width == 4) {
+ Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
+ dest_stride, height, taps);
+ } else {
+ Filter2DVertical<2, /*is_compound=*/true>(
+ intermediate_result, dest, dest_stride, width, height, taps);
+ }
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
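+  // Assumed layout: convolve[is_intra_block_copy][is_compound]
+  //                         [has_vertical_filter][has_horizontal_filter].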
+ dsp->convolve[0][0][0][1] = ConvolveHorizontal_AVX2;
+ dsp->convolve[0][0][1][0] = ConvolveVertical_AVX2;
+ dsp->convolve[0][0][1][1] = Convolve2D_AVX2;
+
+ dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_AVX2;
+ dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_AVX2;
+ dsp->convolve[0][1][1][1] = ConvolveCompound2D_AVX2;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void ConvolveInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void ConvolveInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/libgav1/src/dsp/x86/convolve_avx2.h b/libgav1/src/dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000..e509bc9
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_avx2.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::convolve; see the defines below for specifics. This
+// function is not thread-safe.
+void ConvolveInit_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't already set by a higher level of
+// optimization, signal that the avx2 implementation should be used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundHorizontal LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveVertical
+#define LIBGAV1_Dsp8bpp_ConvolveVertical LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_Convolve2D
+#define LIBGAV1_Dsp8bpp_Convolve2D LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_ConvolveCompoundVertical
+#define LIBGAV1_Dsp8bpp_ConvolveCompoundVertical LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/libgav1/src/dsp/x86/convolve_sse4.cc b/libgav1/src/dsp/x86/convolve_sse4.cc
index ff9a373..9b72fe4 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.cc
+++ b/libgav1/src/dsp/x86/convolve_sse4.cc
@@ -16,7 +16,7 @@
#include "src/utils/constants.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
#include <algorithm>
@@ -34,73 +34,7 @@
namespace low_bitdepth {
namespace {
-// TODO(slavarnway): Move to common neon/sse4 file.
-int GetNumTapsInFilter(const int filter_index) {
- if (filter_index < 2) {
- // Despite the names these only use 6 taps.
- // kInterpolationFilterEightTap
- // kInterpolationFilterEightTapSmooth
- return 6;
- }
-
- if (filter_index == 2) {
- // kInterpolationFilterEightTapSharp
- return 8;
- }
-
- if (filter_index == 3) {
- // kInterpolationFilterBilinear
- return 2;
- }
-
- assert(filter_index > 3);
- // For small sizes (width/height <= 4) the large filters are replaced with 4
- // tap options.
- // If the original filters were |kInterpolationFilterEightTap| or
- // |kInterpolationFilterEightTapSharp| then it becomes
- // |kInterpolationFilterSwitchable|.
- // If it was |kInterpolationFilterEightTapSmooth| then it becomes an unnamed 4
- // tap filter.
- return 4;
-}
-
-constexpr int kIntermediateStride = kMaxSuperBlockSizeInPixels;
-constexpr int kHorizontalOffset = 3;
-constexpr int kFilterIndexShift = 6;
-
-// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
-// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
-// sum from outranging int16_t.
-template <int filter_index>
-__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
- __m128i sum;
- if (filter_index < 2) {
- // 6 taps.
- const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
- const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
- const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
- sum = _mm_add_epi16(v_madd_21, v_madd_43);
- sum = _mm_add_epi16(sum, v_madd_65);
- } else if (filter_index == 2) {
- // 8 taps.
- const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
- const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
- const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
- const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
- const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
- sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
- } else if (filter_index == 3) {
- // 2 taps.
- sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
- } else {
- // 4 taps.
- const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
- sum = _mm_add_epi16(v_madd_32, v_madd_54);
- }
- return sum;
-}
+#include "src/dsp/x86/convolve_sse4.inc"
template <int filter_index>
__m128i SumHorizontalTaps(const uint8_t* const src,
@@ -157,68 +91,7 @@
return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
}
-template <int filter_index>
-__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- const __m128i input0 = LoadLo8(&src[2]);
- const __m128i input1 = LoadLo8(&src[2 + src_stride]);
-
- if (filter_index == 3) {
- // 03 04 04 05 05 06 06 07 ....
- const __m128i input0_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 3);
- // 13 14 14 15 15 16 16 17 ....
- const __m128i input1_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 3);
- const __m128i v_src_43 = _mm_unpacklo_epi64(input0_dup, input1_dup);
- const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
- return v_sum_43;
- }
-
- // 02 03 03 04 04 05 05 06 06 07 ....
- const __m128i input0_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input0, input0), 1);
- // 12 13 13 14 14 15 15 16 16 17 ....
- const __m128i input1_dup =
- _mm_srli_si128(_mm_unpacklo_epi8(input1, input1), 1);
- // 04 05 05 06 06 07 07 08 ...
- const __m128i input0_dup_54 = _mm_srli_si128(input0_dup, 4);
- // 14 15 15 16 16 17 17 18 ...
- const __m128i input1_dup_54 = _mm_srli_si128(input1_dup, 4);
- const __m128i v_src_32 = _mm_unpacklo_epi64(input0_dup, input1_dup);
- const __m128i v_src_54 = _mm_unpacklo_epi64(input0_dup_54, input1_dup_54);
- const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
- const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
- const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
- return v_sum_5432;
-}
-
-template <int filter_index>
-__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- // Normally the Horizontal pass does the downshift in two passes:
- // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
- // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
- // requires adding the rounding offset from the skipped shift.
- constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
-
- sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
- sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
- return _mm_packus_epi16(sum, sum);
-}
-
-template <int filter_index>
-__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
- const __m128i* const v_tap) {
- const __m128i sum =
- SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
-
- return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int num_taps, int step, int filter_index, bool is_2d = false,
+template <int num_taps, int filter_index, bool is_2d = false,
bool is_compound = false>
void FilterHorizontal(const uint8_t* src, const ptrdiff_t src_stride,
void* const dest, const ptrdiff_t pred_stride,
@@ -229,7 +102,7 @@
// 4 tap filters are never used when width > 4.
if (num_taps != 4 && width > 4) {
- int y = 0;
+ int y = height;
do {
int x = 0;
do {
@@ -246,12 +119,12 @@
SimpleHorizontalTaps<filter_index>(&src[x], v_tap);
StoreLo8(&dest8[x], result);
}
- x += step;
+ x += 8;
} while (x < width);
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
return;
}
@@ -261,7 +134,7 @@
assert(num_taps <= 4);
if (num_taps <= 4) {
if (width == 4) {
- int y = 0;
+ int y = height;
do {
if (is_2d || is_compound) {
const __m128i v_sum = HorizontalTaps8To16<filter_index>(src, v_tap);
@@ -273,12 +146,13 @@
src += src_stride;
dest8 += pred_stride;
dest16 += pred_stride;
- } while (++y < height);
+ } while (--y != 0);
return;
}
if (!is_compound) {
- int y = 0;
+ int y = height;
+ if (is_2d) y -= 1;
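+    // The 2d path has an odd |height| (see the note below), so run the
+    // two-row loop |height| - 1 times and handle the final row afterwards.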
do {
if (is_2d) {
const __m128i sum =
@@ -297,8 +171,8 @@
}
src += src_stride << 1;
- y += 2;
- } while (y < height - 1);
+ y -= 2;
+ } while (y != 0);
// The 2d filters have an odd |height| because the horizontal pass
// generates context for the vertical pass.
@@ -330,309 +204,11 @@
}
}
-template <int num_taps, bool is_2d_vertical = false>
-LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
- __m128i* v_tap) {
- if (num_taps == 8) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
- v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
- }
- } else if (num_taps == 6) {
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
- v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
- }
- } else if (num_taps == 4) {
- v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
- v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
- }
- } else { // num_taps == 2
- const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
- v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
- if (is_2d_vertical) {
- v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
- } else {
- v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
- }
- }
-}
-
-template <int num_taps, bool is_compound>
-__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
- const __m128i* const taps) {
- __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
- __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
- if (num_taps >= 4) {
- __m128i madd_lo =
- _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
- __m128i madd_hi =
- _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- if (num_taps >= 6) {
- madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
- madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- if (num_taps == 8) {
- madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
- madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
- sum_lo = _mm_add_epi32(sum_lo, madd_lo);
- sum_hi = _mm_add_epi32(sum_hi, madd_hi);
- }
- }
- }
-
- if (is_compound) {
- return _mm_packs_epi32(
- RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
- RightShiftWithRounding_S32(sum_hi,
- kInterRoundBitsCompoundVertical - 1));
- }
-
- return _mm_packs_epi32(
- RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
- RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
-}
-
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int width,
- const int height, const __m128i* const taps) {
- assert(width >= 8);
- constexpr int next_row = num_taps - 1;
- // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
- const ptrdiff_t src_stride = width;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- int x = 0;
- do {
- __m128i srcs[8];
- const uint16_t* src_x = src + x;
- srcs[0] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps >= 4) {
- srcs[1] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[2] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps >= 6) {
- srcs[3] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[4] = LoadAligned16(src_x);
- src_x += src_stride;
- if (num_taps == 8) {
- srcs[5] = LoadAligned16(src_x);
- src_x += src_stride;
- srcs[6] = LoadAligned16(src_x);
- src_x += src_stride;
- }
- }
- }
-
- int y = 0;
- do {
- srcs[next_row] = LoadAligned16(src_x);
- src_x += src_stride;
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
- if (is_compound) {
- StoreUnaligned16(dst16 + x + y * dst_stride, sum);
- } else {
- StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(sum, sum));
- }
-
- srcs[0] = srcs[1];
- if (num_taps >= 4) {
- srcs[1] = srcs[2];
- srcs[2] = srcs[3];
- if (num_taps >= 6) {
- srcs[3] = srcs[4];
- srcs[4] = srcs[5];
- if (num_taps == 8) {
- srcs[5] = srcs[6];
- srcs[6] = srcs[7];
- }
- }
- }
- } while (++y < height);
- x += 8;
- } while (x < width);
-}
-
-// Take advantage of |src_stride| == |width| to process two rows at a time.
-template <int num_taps, bool is_compound = false>
-void Filter2DVertical4xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const __m128i* const taps) {
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- __m128i srcs[9];
- srcs[0] = LoadAligned16(src);
- src += 8;
- if (num_taps >= 4) {
- srcs[2] = LoadAligned16(src);
- src += 8;
- srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
- if (num_taps >= 6) {
- srcs[4] = LoadAligned16(src);
- src += 8;
- srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
- if (num_taps == 8) {
- srcs[6] = LoadAligned16(src);
- src += 8;
- srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
- }
- }
- }
-
- int y = 0;
- do {
- srcs[num_taps] = LoadAligned16(src);
- src += 8;
- srcs[num_taps - 1] = _mm_unpacklo_epi64(
- _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
- if (is_compound) {
- StoreUnaligned16(dst16, sum);
- dst16 += 4 << 1;
- } else {
- const __m128i results = _mm_packus_epi16(sum, sum);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- if (num_taps >= 4) {
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- if (num_taps >= 6) {
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- if (num_taps == 8) {
- srcs[5] = srcs[7];
- srcs[6] = srcs[8];
- }
- }
- }
- y += 2;
- } while (y < height);
-}
-
-// Take advantage of |src_stride| == |width| to process four rows at a time.
-template <int num_taps>
-void Filter2DVertical2xH(const uint16_t* src, void* const dst,
- const ptrdiff_t dst_stride, const int height,
- const __m128i* const taps) {
- constexpr int next_row = (num_taps < 6) ? 4 : 8;
-
- auto* dst8 = static_cast<uint8_t*>(dst);
-
- __m128i srcs[9];
- srcs[0] = LoadAligned16(src);
- src += 8;
- if (num_taps >= 6) {
- srcs[4] = LoadAligned16(src);
- src += 8;
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- if (num_taps == 8) {
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- }
- }
-
- int y = 0;
- do {
- srcs[next_row] = LoadAligned16(src);
- src += 8;
- if (num_taps == 2) {
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- } else if (num_taps == 4) {
- srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- } else if (num_taps == 6) {
- srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
- srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
- srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
- } else if (num_taps == 8) {
- srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
- srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
- srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
- }
-
- const __m128i sum =
- SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
- const __m128i results = _mm_packus_epi16(sum, sum);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- // When |height| <= 4 the taps are restricted to 2 and 4 tap variants.
- // Therefore we don't need to check this condition when |height| > 4.
- if (num_taps <= 4 && height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- if (num_taps == 6) {
- srcs[1] = srcs[5];
- srcs[4] = srcs[8];
- } else if (num_taps == 8) {
- srcs[1] = srcs[5];
- srcs[2] = srcs[6];
- srcs[3] = srcs[7];
- srcs[4] = srcs[8];
- }
-
- y += 4;
- } while (y < height);
-}
-
template <bool is_2d = false, bool is_compound = false>
LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
const uint8_t* const src, const ptrdiff_t src_stride, void* const dst,
const ptrdiff_t dst_stride, const int width, const int height,
- const int subpixel, const int filter_index) {
- const int filter_id = (subpixel >> 6) & kSubPixelMask;
+ const int filter_id, const int filter_index) {
assert(filter_id != 0);
__m128i v_tap[4];
const __m128i v_horizontal_filter =
@@ -640,37 +216,39 @@
if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_horizontal_filter, v_tap);
- FilterHorizontal<8, 8, 2, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<8, 2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 1) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 1, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 1, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 0) { // 6 tap.
SetupTaps<6>(&v_horizontal_filter, v_tap);
- FilterHorizontal<6, 8, 0, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<6, 0, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 4, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else if (filter_index == 5) { // 4 tap.
SetupTaps<4>(&v_horizontal_filter, v_tap);
- FilterHorizontal<4, 8, 5, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<4, 5, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
} else { // 2 tap.
SetupTaps<2>(&v_horizontal_filter, v_tap);
- FilterHorizontal<2, 8, 3, is_2d, is_compound>(
- src, src_stride, dst, dst_stride, width, height, v_tap);
+ FilterHorizontal<2, 3, is_2d, is_compound>(src, src_stride, dst, dst_stride,
+ width, height, v_tap);
}
}
void Convolve2D_SSE4_1(const void* const reference,
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
- const int vertical_filter_index, const int subpixel_x,
- const int subpixel_y, const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(vert_filter_index);
@@ -686,18 +264,17 @@
(vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
- width, intermediate_height, subpixel_x,
- horiz_filter_index);
+ width, intermediate_height,
+ horizontal_filter_id, horiz_filter_index);
// Vertical filter.
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
- const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
__m128i taps[4];
const __m128i v_filter =
- LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
if (vertical_taps == 8) {
SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
@@ -750,39 +327,6 @@
}
}
-// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
-// Vertical calculations.
-__m128i Compound1DShift(const __m128i sum) {
- return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
-}
-
-template <int filter_index>
-__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
- __m128i v_src[4];
-
- if (filter_index < 2) {
- // 6 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- } else if (filter_index == 2) {
- // 8 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
- v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
- } else if (filter_index == 3) {
- // 2 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- } else if (filter_index > 3) {
- // 4 taps.
- v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
- v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
- }
- const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
- return sum;
-}
-
template <int filter_index, bool is_compound = false>
void FilterVertical(const uint8_t* src, const ptrdiff_t src_stride,
void* const dst, const ptrdiff_t dst_stride,
@@ -819,7 +363,9 @@
}
}
- int y = 0;
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
do {
srcs[next_row] = LoadLo8(src_x);
src_x += src_stride;
@@ -827,11 +373,13 @@
const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
if (is_compound) {
const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16 + x + y * dst_stride, results);
+ StoreUnaligned16(dst16_x, results);
+ dst16_x += dst_stride;
} else {
const __m128i results =
RightShiftWithRounding_S16(sums, kFilterBits - 1);
- StoreLo8(dst8 + x + y * dst_stride, _mm_packus_epi16(results, results));
+ StoreLo8(dst8_x, _mm_packus_epi16(results, results));
+ dst8_x += dst_stride;
}
srcs[0] = srcs[1];
@@ -847,513 +395,19 @@
}
}
}
- } while (++y < height);
+ } while (--y != 0);
x += 8;
} while (x < width);
}
-template <int filter_index, bool is_compound = false>
-void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
- auto* dst8 = static_cast<uint8_t*>(dst);
- auto* dst16 = static_cast<uint16_t*>(dst);
-
- __m128i srcs[9];
-
- if (num_taps == 2) {
- srcs[2] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
-
- int y = 0;
- do {
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- y += 2;
- } while (y < height);
- } else if (num_taps == 4) {
- srcs[4] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
-
- int y = 0;
- do {
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- y += 2;
- } while (y < height);
- } else if (num_taps == 6) {
- srcs[6] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
-
- int y = 0;
- do {
- // 50 51 52 53
- const __m128i c = Load4(src);
- // 40 41 42 43 50 51 52 53
- srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
- src += src_stride;
- // 60 61 62 63
- srcs[6] = Load4(src);
- src += src_stride;
- // 50 51 52 53 60 61 62 63
- srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- y += 2;
- } while (y < height);
- } else if (num_taps == 8) {
- srcs[8] = _mm_setzero_si128();
- // 00 01 02 03
- srcs[0] = Load4(src);
- src += src_stride;
- // 10 11 12 13
- const __m128i a = Load4(src);
- // 00 01 02 03 10 11 12 13
- srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
- src += src_stride;
- // 20 21 22 23
- srcs[2] = Load4(src);
- src += src_stride;
- // 10 11 12 13 20 21 22 23
- srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
- // 30 31 32 33
- const __m128i b = Load4(src);
- // 20 21 22 23 30 31 32 33
- srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
- src += src_stride;
- // 40 41 42 43
- srcs[4] = Load4(src);
- src += src_stride;
- // 30 31 32 33 40 41 42 43
- srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
- // 50 51 52 53
- const __m128i c = Load4(src);
- // 40 41 42 43 50 51 52 53
- srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
- src += src_stride;
- // 60 61 62 63
- srcs[6] = Load4(src);
- src += src_stride;
- // 50 51 52 53 60 61 62 63
- srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
-
- int y = 0;
- do {
- // 70 71 72 73
- const __m128i d = Load4(src);
- // 60 61 62 63 70 71 72 73
- srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
- src += src_stride;
- // 80 81 82 83
- srcs[8] = Load4(src);
- src += src_stride;
- // 70 71 72 73 80 81 82 83
- srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
-
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- if (is_compound) {
- const __m128i results = Compound1DShift(sums);
- StoreUnaligned16(dst16, results);
- dst16 += 4 << 1;
- } else {
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
- Store4(dst8, results);
- dst8 += dst_stride;
- Store4(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- }
-
- srcs[0] = srcs[2];
- srcs[1] = srcs[3];
- srcs[2] = srcs[4];
- srcs[3] = srcs[5];
- srcs[4] = srcs[6];
- srcs[5] = srcs[7];
- srcs[6] = srcs[8];
- y += 2;
- } while (y < height);
- }
-}
-
-template <int filter_index, bool negative_outside_taps = false>
-void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
- void* const dst, const ptrdiff_t dst_stride,
- const int height, const __m128i* const v_tap) {
- const int num_taps = GetNumTapsInFilter(filter_index);
- auto* dst8 = static_cast<uint8_t*>(dst);
-
- __m128i srcs[9];
-
- if (num_taps == 2) {
- srcs[2] = _mm_setzero_si128();
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
-
- int y = 0;
- do {
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[2] = Load2<0>(src, srcs[2]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41
- const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_2, 2);
- // This uses srcs[0]..srcs[1].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- if (height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[2];
- y += 4;
- } while (y < height);
- } else if (num_taps == 4) {
- srcs[4] = _mm_setzero_si128();
-
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
-
- int y = 0;
- do {
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2<0>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4, 2);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
- // This uses srcs[0]..srcs[3].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- if (height == 2) return;
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- y += 4;
- } while (y < height);
- } else if (num_taps == 6) {
- // During the vertical pass the number of taps is restricted when
- // |height| <= 4.
- assert(height > 4);
- srcs[8] = _mm_setzero_si128();
-
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2(src);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
-
- int y = 0;
- do {
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61 70 71
- srcs[4] = Load2<3>(src, srcs[4]);
- src += src_stride;
- // 80 81
- srcs[8] = Load2<0>(src, srcs[8]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
- const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
- // 50 51 60 61 70 71 80 81
- srcs[5] = _mm_srli_si128(srcs_4_8, 2);
-
- // This uses srcs[0]..srcs[5].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- srcs[1] = srcs[5];
- srcs[4] = srcs[8];
- y += 4;
- } while (y < height);
- } else if (num_taps == 8) {
- // During the vertical pass the number of taps is restricted when
- // |height| <= 4.
- assert(height > 4);
- srcs[8] = _mm_setzero_si128();
- // 00 01
- srcs[0] = Load2(src);
- src += src_stride;
- // 00 01 10 11
- srcs[0] = Load2<1>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21
- srcs[0] = Load2<2>(src, srcs[0]);
- src += src_stride;
- // 00 01 10 11 20 21 30 31
- srcs[0] = Load2<3>(src, srcs[0]);
- src += src_stride;
- // 40 41
- srcs[4] = Load2(src);
- src += src_stride;
- // 40 41 50 51
- srcs[4] = Load2<1>(src, srcs[4]);
- src += src_stride;
- // 40 41 50 51 60 61
- srcs[4] = Load2<2>(src, srcs[4]);
- src += src_stride;
-
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
- const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
- // 10 11 20 21 30 31 40 41
- srcs[1] = _mm_srli_si128(srcs_0_4, 2);
- // 20 21 30 31 40 41 50 51
- srcs[2] = _mm_srli_si128(srcs_0_4, 4);
- // 30 31 40 41 50 51 60 61
- srcs[3] = _mm_srli_si128(srcs_0_4, 6);
-
- int y = 0;
- do {
- // 40 41 50 51 60 61 70 71
- srcs[4] = Load2<3>(src, srcs[4]);
- src += src_stride;
- // 80 81
- srcs[8] = Load2<0>(src, srcs[8]);
- src += src_stride;
- // 80 81 90 91
- srcs[8] = Load2<1>(src, srcs[8]);
- src += src_stride;
- // 80 81 90 91 a0 a1
- srcs[8] = Load2<2>(src, srcs[8]);
- src += src_stride;
-
- // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
- const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
- // 50 51 60 61 70 71 80 81
- srcs[5] = _mm_srli_si128(srcs_4_8, 2);
- // 60 61 70 71 80 81 90 91
- srcs[6] = _mm_srli_si128(srcs_4_8, 4);
- // 70 71 80 81 90 91 a0 a1
- srcs[7] = _mm_srli_si128(srcs_4_8, 6);
-
- // This uses srcs[0]..srcs[7].
- const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
- const __m128i results_16 =
- RightShiftWithRounding_S16(sums, kFilterBits - 1);
- const __m128i results = _mm_packus_epi16(results_16, results_16);
-
- Store2(dst8, results);
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 2));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 4));
- dst8 += dst_stride;
- Store2(dst8, _mm_srli_si128(results, 6));
- dst8 += dst_stride;
-
- srcs[0] = srcs[4];
- srcs[1] = srcs[5];
- srcs[2] = srcs[6];
- srcs[3] = srcs[7];
- srcs[4] = srcs[8];
- y += 4;
- } while (y < height);
- }
-}
-
void ConvolveVertical_SSE4_1(const void* const reference,
const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/,
const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y,
- const int width, const int height,
- void* prediction, const ptrdiff_t pred_stride) {
+ const int /*horizontal_filter_id*/,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
@@ -1361,19 +415,18 @@
(vertical_taps / 2 - 1) * src_stride;
auto* dest = static_cast<uint8_t*>(prediction);
const ptrdiff_t dest_stride = pred_stride;
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
__m128i taps[4];
const __m128i v_filter =
- LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
if (filter_index < 2) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<0>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<6, 0>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<0>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1381,9 +434,9 @@
} else if (filter_index == 2) { // 8 tap.
SetupTaps<8>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<8, 2>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1391,9 +444,9 @@
} else if (filter_index == 3) { // 2 tap.
SetupTaps<2>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<3>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<2, 3>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<3>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1401,9 +454,9 @@
} else if (filter_index == 4) { // 4 tap.
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4, 4>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1414,9 +467,9 @@
SetupTaps<4>(&v_filter, taps);
if (width == 2) {
- FilterVertical2xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical2xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
} else if (width == 4) {
- FilterVertical4xH<5>(src, src_stride, dest, dest_stride, height, taps);
+ FilterVertical4xH<4, 5>(src, src_stride, dest, dest_stride, height, taps);
} else {
FilterVertical<5>(src, src_stride, dest, dest_stride, width, height,
taps);
@@ -1424,11 +477,14 @@
}
}
-void ConvolveCompoundCopy_SSE4(
- const void* const reference, const ptrdiff_t reference_stride,
- const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
- const int /*subpixel_x*/, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t pred_stride) {
+void ConvolveCompoundCopy_SSE4(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/,
+ const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/,
+ const int /*vertical_filter_id*/,
+ const int width, const int height,
+ void* prediction, const ptrdiff_t pred_stride) {
const auto* src = static_cast<const uint8_t*>(reference);
const ptrdiff_t src_stride = reference_stride;
auto* dest = static_cast<uint16_t*>(prediction);
@@ -1485,26 +541,26 @@
void ConvolveCompoundVertical_SSE4_1(
const void* const reference, const ptrdiff_t reference_stride,
const int /*horizontal_filter_index*/, const int vertical_filter_index,
- const int /*subpixel_x*/, const int subpixel_y, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int /*horizontal_filter_id*/, const int vertical_filter_id,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(vertical_filter_index, height);
const int vertical_taps = GetNumTapsInFilter(filter_index);
const ptrdiff_t src_stride = reference_stride;
const auto* src = static_cast<const uint8_t*>(reference) -
(vertical_taps / 2 - 1) * src_stride;
auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = (subpixel_y >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
__m128i taps[4];
const __m128i v_filter =
- LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
+ LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
if (filter_index < 2) { // 6 tap.
SetupTaps<6>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<0, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<6, 0, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<0, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1513,8 +569,8 @@
SetupTaps<8>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<8, 2, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1523,8 +579,8 @@
SetupTaps<2>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<3, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<2, 3, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<3, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1533,8 +589,8 @@
SetupTaps<4>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, 4, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1543,8 +599,8 @@
SetupTaps<4>(&v_filter, taps);
if (width == 4) {
- FilterVertical4xH<5, /*is_compound=*/true>(src, src_stride, dest, 4,
- height, taps);
+ FilterVertical4xH<4, 5, /*is_compound=*/true>(src, src_stride, dest, 4,
+ height, taps);
} else {
FilterVertical<5, /*is_compound=*/true>(src, src_stride, dest, width,
width, height, taps);
@@ -1556,7 +612,8 @@
const ptrdiff_t reference_stride,
const int horizontal_filter_index,
const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/,
+ const int horizontal_filter_id,
+ const int /*vertical_filter_id*/,
const int width, const int height,
void* prediction, const ptrdiff_t pred_stride) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
@@ -1565,28 +622,32 @@
auto* dest = static_cast<uint8_t*>(prediction);
DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
- subpixel_x, filter_index);
+ horizontal_filter_id, filter_index);
}
void ConvolveCompoundHorizontal_SSE4_1(
const void* const reference, const ptrdiff_t reference_stride,
const int horizontal_filter_index, const int /*vertical_filter_index*/,
- const int subpixel_x, const int /*subpixel_y*/, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+ const int horizontal_filter_id, const int /*vertical_filter_id*/,
+ const int width, const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
const int filter_index = GetFilterIndex(horizontal_filter_index, width);
const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
auto* dest = static_cast<uint16_t*>(prediction);
DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
- src, reference_stride, dest, width, width, height, subpixel_x,
+ src, reference_stride, dest, width, width, height, horizontal_filter_id,
filter_index);
}
-void ConvolveCompound2D_SSE4_1(
- const void* const reference, const ptrdiff_t reference_stride,
- const int horizontal_filter_index, const int vertical_filter_index,
- const int subpixel_x, const int subpixel_y, const int width,
- const int height, void* prediction, const ptrdiff_t /*pred_stride*/) {
+void ConvolveCompound2D_SSE4_1(const void* const reference,
+ const ptrdiff_t reference_stride,
+ const int horizontal_filter_index,
+ const int vertical_filter_index,
+ const int horizontal_filter_id,
+ const int vertical_filter_id, const int width,
+ const int height, void* prediction,
+ const ptrdiff_t /*pred_stride*/) {
// The output of the horizontal filter, i.e. the intermediate_result, is
// guaranteed to fit in int16_t.
alignas(16) uint16_t
@@ -1609,17 +670,16 @@
DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
src, src_stride, intermediate_result, width, width, intermediate_height,
- subpixel_x, horiz_filter_index);
+ horizontal_filter_id, horiz_filter_index);
// Vertical filter.
auto* dest = static_cast<uint16_t*>(prediction);
- const int filter_id = ((subpixel_y & 1023) >> 6) & kSubPixelMask;
- assert(filter_id != 0);
+ assert(vertical_filter_id != 0);
const ptrdiff_t dest_stride = width;
__m128i taps[4];
const __m128i v_filter =
- LoadLo8(kHalfSubPixelFilters[vert_filter_index][filter_id]);
+ LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
if (vertical_taps == 8) {
SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
@@ -1777,7 +837,11 @@
template <int num_taps, int grade_x>
inline void PrepareSourceVectors(const uint8_t* src, const __m128i src_indices,
__m128i* const source /*[num_taps >> 1]*/) {
- const __m128i src_vals = LoadUnaligned16(src);
+ // |used_bytes| is only computed in msan builds. Mask away unused bytes for
+ // msan because it incorrectly models the outcome of the shuffles in some
+ // cases. This has not been reproduced out of context.
+ const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
+ const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
source[0] = _mm_shuffle_epi8(src_vals, src_indices);
if (grade_x == 1) {
if (num_taps > 2) {
@@ -1793,7 +857,7 @@
assert(grade_x > 1);
assert(num_taps != 4);
// grade_x > 1 also means width >= 8 && num_taps != 4
- const __m128i src_vals_ext = LoadLo8(src + 16);
+ const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
if (num_taps > 2) {
source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
src_indices);
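To unpack the |used_bytes| arithmetic (a reading of the shuffle layout, not
upstream text): byte 15 of |src_indices| is the highest source index the first
shuffle touches, so source[0] consumes last_index + 1 bytes, and the remaining
source[k] vectors reach at most num_taps - 2 bytes further via the byte shifts
of 2, 4, and 6. Everything past |used_bytes| in the 16-byte load (or the
8-byte extension load) is masked only to satisfy msan and never affects a
stored result.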
@@ -2008,14 +1072,10 @@
// |width_class| is 2, 4, or 8, according to the Store function that should be
// used.
template <int num_taps, int width_class, bool is_compound>
-#if LIBGAV1_MSAN
-__attribute__((no_sanitize_memory)) void ConvolveVerticalScale(
-#else
-inline void ConvolveVerticalScale(
-#endif
- const int16_t* src, const int width, const int subpixel_y,
- const int filter_index, const int step_y, const int height, void* dest,
- const ptrdiff_t dest_stride) {
+inline void ConvolveVerticalScale(const int16_t* src, const int width,
+ const int subpixel_y, const int filter_index,
+ const int step_y, const int height,
+ void* dest, const ptrdiff_t dest_stride) {
constexpr ptrdiff_t src_stride = kIntermediateStride;
constexpr int kernel_offset = (8 - num_taps) / 2;
const int16_t* src_y = src;
@@ -2282,6 +1342,540 @@
}
}
+inline void HalfAddHorizontal(const uint8_t* src, uint8_t* dst) {
+ const __m128i left = LoadUnaligned16(src);
+ const __m128i right = LoadUnaligned16(src + 1);
+ StoreUnaligned16(dst, _mm_avg_epu8(left, right));
+}
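_mm_avg_epu8 rounds half up: each output byte is (left + right + 1) >> 1, the
half-sample value intra block copy needs. A minimal scalar model of the same
16-byte step (the function name is illustrative, not part of the patch):

    inline void HalfAddHorizontalScalar(const uint8_t* src, uint8_t* dst) {
      for (int i = 0; i < 16; ++i) {
        // Rounded average of horizontally adjacent pixels; the sum is at
        // most 2 * 255 + 1, so the shift never overflows an int.
        dst[i] = static_cast<uint8_t>((src[i] + src[i + 1] + 1) >> 1);
      }
    }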
+
+template <int width>
+inline void IntraBlockCopyHorizontal(const uint8_t* src,
+ const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+
+ int y = height;
+ do {
+ HalfAddHorizontal(src, dst);
+ if (width >= 32) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width >= 64) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ if (width == 128) {
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ src += 16;
+ dst += 16;
+ HalfAddHorizontal(src, dst);
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
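The remainder strides account for the pointer advances inside a row: every
HalfAddHorizontal after the first is preceded by a 16-byte step, so for
width 64 a row advances |src| by 48 bytes and the remainder stride of
src_stride - (width - 16) = src_stride - 48 lands the pointer on the start of
the next row.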
+
+void ConvolveIntraBlockCopyHorizontal_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+    const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+    const int width, const int height, void* const prediction,
+    const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ int y = height;
+ do {
+ const __m128i left = LoadLo8(src);
+ const __m128i right = LoadLo8(src + 1);
+ StoreLo8(dest, _mm_avg_epu8(left, right));
+
+ src += reference_stride;
+ dest += pred_stride;
+ } while (--y != 0);
+ } else if (width == 4) {
+ int y = height;
+ do {
+ __m128i left = Load4(src);
+ __m128i right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ } else {
+ assert(width == 2);
+ __m128i left = _mm_setzero_si128();
+ __m128i right = _mm_setzero_si128();
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<1>(src, left);
+ right = Load2<1>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i result = _mm_avg_epu8(left, right);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 2));
+ dest += pred_stride;
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int width>
+inline void IntraBlockCopyVertical(const uint8_t* src,
+ const ptrdiff_t src_stride, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
+ __m128i row[8], below[8];
+
+ row[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ row[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ row[2] = LoadUnaligned16(src);
+ src += 16;
+ row[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ row[4] = LoadUnaligned16(src);
+ src += 16;
+ row[5] = LoadUnaligned16(src);
+ src += 16;
+ row[6] = LoadUnaligned16(src);
+ src += 16;
+ row[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ below[0] = LoadUnaligned16(src);
+ if (width >= 32) {
+ src += 16;
+ below[1] = LoadUnaligned16(src);
+ if (width >= 64) {
+ src += 16;
+ below[2] = LoadUnaligned16(src);
+ src += 16;
+ below[3] = LoadUnaligned16(src);
+ if (width == 128) {
+ src += 16;
+ below[4] = LoadUnaligned16(src);
+ src += 16;
+ below[5] = LoadUnaligned16(src);
+ src += 16;
+ below[6] = LoadUnaligned16(src);
+ src += 16;
+ below[7] = LoadUnaligned16(src);
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
+ row[0] = below[0];
+ if (width >= 32) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
+ row[1] = below[1];
+ if (width >= 64) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
+ row[2] = below[2];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
+ row[3] = below[3];
+      if (width == 128) {
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
+ row[4] = below[4];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
+ row[5] = below[5];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
+ row[6] = below[6];
+ dst += 16;
+ StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
+ row[7] = below[7];
+ }
+ }
+ }
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopyVertical_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+
+ if (width == 128) {
+ IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
+ pred_stride);
+ } else if (width == 8) {
+ __m128i row, below;
+ row = LoadLo8(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = LoadLo8(src);
+ src += reference_stride;
+
+ StoreLo8(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else if (width == 4) {
+ __m128i row = Load4(src);
+ src += reference_stride;
+
+ int y = height;
+ do {
+ __m128i below = Load4(src);
+ src += reference_stride;
+
+ Store4(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ } else {
+ assert(width == 2);
+ __m128i row = Load2(src);
+ __m128i below = _mm_setzero_si128();
+ src += reference_stride;
+
+ int y = height;
+ do {
+ below = Load2<0>(src, below);
+ src += reference_stride;
+
+ Store2(dest, _mm_avg_epu8(row, below));
+ dest += pred_stride;
+
+ row = below;
+ } while (--y != 0);
+ }
+}
+
+// Load then add two uint8_t vectors. Return the uint16_t vector result.
+inline __m128i LoadU8AndAddLong(const uint8_t* src, const uint8_t* src1) {
+ const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
+ const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
+ return _mm_add_epi16(a, b);
+}
+
+inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
+ const __m128i a = _mm_add_epi16(v0, v1);
+ const __m128i b = _mm_srli_epi16(a, 1);
+ // Use avg here to shift right by 1 with round.
+ const __m128i c = _mm_avg_epu16(b, _mm_setzero_si128());
+ return _mm_packus_epi16(c, c);
+}
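Per 16-bit lane this amounts to one rounding shift by 2: _mm_avg_epu16(b, 0)
evaluates (b + 1) >> 1, and ((a >> 1) + 1) >> 1 == (a + 2) >> 2 for any
unsigned a. A scalar model of a single lane (illustrative name; each input is
assumed to already hold a sum of two horizontally adjacent pixels):

    inline uint8_t AddU16RightShift2AndPackScalar(uint16_t v0, uint16_t v1) {
      // Rounded average of four pixels. The maximum 4 * 255 = 1020 yields
      // (1020 + 2) >> 2 = 255, so the uint8_t cast never truncates.
      return static_cast<uint8_t>((v0 + v1 + 2) >> 2);
    }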
+
+template <int width>
+inline void IntraBlockCopy2D(const uint8_t* src, const ptrdiff_t src_stride,
+ const int height, uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
+ const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
+ __m128i row[16];
+ row[0] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 16) {
+ src += 8;
+ row[1] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 32) {
+ src += 8;
+ row[2] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[3] = LoadU8AndAddLong(src, src + 1);
+ if (width >= 64) {
+ src += 8;
+ row[4] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[5] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[6] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[7] = LoadU8AndAddLong(src, src + 1);
+ if (width == 128) {
+ src += 8;
+ row[8] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[9] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[10] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[11] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[12] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[13] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[14] = LoadU8AndAddLong(src, src + 1);
+ src += 8;
+ row[15] = LoadU8AndAddLong(src, src + 1);
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+
+ int y = height;
+ do {
+ const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
+ row[0] = below_0;
+ if (width >= 16) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
+ row[1] = below_1;
+ if (width >= 32) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
+ row[2] = below_2;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
+ row[3] = below_3;
+ if (width >= 64) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
+ row[4] = below_4;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
+ row[5] = below_5;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
+ row[6] = below_6;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
+ row[7] = below_7;
+ if (width == 128) {
+ src += 8;
+ dst += 8;
+
+ const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
+ row[8] = below_8;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
+ row[9] = below_9;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
+ row[10] = below_10;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
+ row[11] = below_11;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
+ row[12] = below_12;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
+ row[13] = below_13;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
+ row[14] = below_14;
+ src += 8;
+ dst += 8;
+
+ const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
+ StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
+ row[15] = below_15;
+ }
+ }
+ }
+ }
+ src += src_remainder_stride;
+ dst += dst_remainder_stride;
+ } while (--y != 0);
+}
+
+void ConvolveIntraBlockCopy2D_SSE4_1(
+ const void* const reference, const ptrdiff_t reference_stride,
+ const int /*horizontal_filter_index*/, const int /*vertical_filter_index*/,
+ const int /*horizontal_filter_id*/, const int /*vertical_filter_id*/,
+ const int width, const int height, void* const prediction,
+ const ptrdiff_t pred_stride) {
+ const auto* src = static_cast<const uint8_t*>(reference);
+ auto* dest = static_cast<uint8_t*>(prediction);
+  // Note: vertical access of |height| + 1 rows is allowed here. Because this
+  // function is only used for the u/v planes of intra block copy, such access
+  // is guaranteed to be within the prediction block.
+
+ if (width == 128) {
+ IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 64) {
+ IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 32) {
+ IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 16) {
+ IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 8) {
+ IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
+ } else if (width == 4) {
+ __m128i left = _mm_cvtepu8_epi16(Load4(src));
+ __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
+ src += reference_stride;
+
+ __m128i row = _mm_add_epi16(left, right);
+
+ int y = height;
+ do {
+ left = Load4(src);
+ right = Load4(src + 1);
+ src += reference_stride;
+ left = _mm_unpacklo_epi32(left, Load4(src));
+ right = _mm_unpacklo_epi32(right, Load4(src + 1));
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store4(dest, result);
+ dest += pred_stride;
+ Store4(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ } else {
+ __m128i left = Load2(src);
+ __m128i right = Load2(src + 1);
+ src += reference_stride;
+
+ __m128i row =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+
+ int y = height;
+ do {
+ left = Load2<0>(src, left);
+ right = Load2<0>(src + 1, right);
+ src += reference_stride;
+ left = Load2<2>(src, left);
+ right = Load2<2>(src + 1, right);
+ src += reference_stride;
+
+ const __m128i below =
+ _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
+ const __m128i result =
+ AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
+
+ Store2(dest, result);
+ dest += pred_stride;
+ Store2(dest, _mm_srli_si128(result, 4));
+ dest += pred_stride;
+
+ row = _mm_srli_si128(below, 8);
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -2294,6 +1888,10 @@
dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
+ dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
+ dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
+ dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
+
dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
}
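For readers decoding the table indices: dsp->convolve is laid out as
[is_intra_block_copy][is_compound][has_vertical_filter][has_horizontal_filter],
so these three new entries register the intra block copy horizontal, vertical,
and 2D specializations alongside the existing inter paths.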
@@ -2306,7 +1904,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -2314,4 +1912,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/convolve_sse4.h b/libgav1/src/dsp/x86/convolve_sse4.h
index e449a87..d6c3155 100644
--- a/libgav1/src/dsp/x86/convolve_sse4.h
+++ b/libgav1/src/dsp/x86/convolve_sse4.h
@@ -32,7 +32,7 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_ConvolveHorizontal
#define LIBGAV1_Dsp8bpp_ConvolveHorizontal LIBGAV1_CPU_SSE4_1
@@ -70,6 +70,6 @@
#define LIBGAV1_Dsp8bpp_ConvolveCompoundScale2D LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_CONVOLVE_SSE4_H_
diff --git a/libgav1/src/dsp/x86/convolve_sse4.inc b/libgav1/src/dsp/x86/convolve_sse4.inc
new file mode 100644
index 0000000..550d6a4
--- /dev/null
+++ b/libgav1/src/dsp/x86/convolve_sse4.inc
@@ -0,0 +1,934 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Common 128-bit functions used by the sse4/avx2 convolve implementations.
+// This file is included inside an anonymous namespace in the files where they
+// are necessary.
+
+#include "src/dsp/convolve.inc"
+
+// Multiply every entry in |src[]| by the corresponding entry in |taps[]| and
+// sum. The filters in |taps[]| are pre-shifted by 1. This prevents the final
+// sum from exceeding the range of int16_t.
+template <int filter_index>
+__m128i SumOnePassTaps(const __m128i* const src, const __m128i* const taps) {
+ __m128i sum;
+ if (filter_index < 2) {
+ // 6 taps.
+ const __m128i v_madd_21 = _mm_maddubs_epi16(src[0], taps[0]); // k2k1
+ const __m128i v_madd_43 = _mm_maddubs_epi16(src[1], taps[1]); // k4k3
+ const __m128i v_madd_65 = _mm_maddubs_epi16(src[2], taps[2]); // k6k5
+ sum = _mm_add_epi16(v_madd_21, v_madd_43);
+ sum = _mm_add_epi16(sum, v_madd_65);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ const __m128i v_madd_10 = _mm_maddubs_epi16(src[0], taps[0]); // k1k0
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[1], taps[1]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[2], taps[2]); // k5k4
+ const __m128i v_madd_76 = _mm_maddubs_epi16(src[3], taps[3]); // k7k6
+ const __m128i v_sum_3210 = _mm_add_epi16(v_madd_10, v_madd_32);
+ const __m128i v_sum_7654 = _mm_add_epi16(v_madd_54, v_madd_76);
+ sum = _mm_add_epi16(v_sum_7654, v_sum_3210);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ sum = _mm_maddubs_epi16(src[0], taps[0]); // k4k3
+ } else {
+ // 4 taps.
+ const __m128i v_madd_32 = _mm_maddubs_epi16(src[0], taps[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(src[1], taps[1]); // k5k4
+ sum = _mm_add_epi16(v_madd_32, v_madd_54);
+ }
+ return sum;
+}
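Why halving is enough (a sketch of the bound, using a hypothetical tap set
rather than the exact libgav1 tables): pixels are unsigned and at most 255, so
the exact sum lies between 255 * (sum of negative taps) and 255 * (sum of
positive taps), and halving every tap halves both bounds while also keeping
each saturating _mm_maddubs_epi16 pair inside int16_t:

    // Hypothetical sharp 8-tap filter halved: {-1, 7, -18, 44, 44, -18, 7, -1}.
    static_assert(255 * (7 + 44 + 44 + 7) == 26010 && 26010 <= 32767,
                  "positive-tap bound fits in int16_t");
    static_assert(255 * (1 + 18 + 18 + 1) == 9690 && 9690 <= 32768,
                  "negative-tap bound fits in int16_t");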
+
+template <int filter_index>
+__m128i SumHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ const __m128i v_src = LoadHi8(LoadLo8(&src[0]), &src[src_stride]);
+
+ if (filter_index == 3) {
+ // 03 04 04 05 05 06 06 07 13 14 14 15 15 16 16 17
+ const __m128i v_src_43 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0f0e0e0d, 0x0d0c0c0b, 0x07060605, 0x05040403));
+ const __m128i v_sum_43 = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
+ return v_sum_43;
+ }
+
+ // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16
+ const __m128i v_src_32 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(0x0e0d0d0c, 0x0c0b0b0a, 0x06050504, 0x04030302));
+ // 04 05 05 06 06 07 07 xx 14 15 15 16 16 17 17 xx
+ const __m128i v_src_54 = _mm_shuffle_epi8(
+ v_src, _mm_set_epi32(static_cast<int>(0x800f0f0e), 0x0e0d0d0c,
+ static_cast<int>(0x80070706), 0x06050504));
+ const __m128i v_madd_32 = _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
+ const __m128i v_madd_54 = _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
+ const __m128i v_sum_5432 = _mm_add_epi16(v_madd_54, v_madd_32);
+ return v_sum_5432;
+}
+
+template <int filter_index>
+__m128i SimpleHorizontalTaps2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ __m128i sum = SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ // Normally the Horizontal pass does the downshift in two passes:
+ // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
+ // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
+ // requires adding the rounding offset from the skipped shift.
+ constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
+
+ sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
+ sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
+ return _mm_packus_epi16(sum, sum);
+}
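A worked check of the combined rounding, assuming the 8bpp constants
kInterRoundBitsHorizontal == 3 and kFilterBits == 7: the two-stage form
(((sum + 2) >> 2) + 8) >> 4 and the single shift (sum + 2 + 32) >> 6 agree for
every integer sum by the nested-floor identity
floor((floor(n / p) + q) / r) == floor((n + p * q) / (p * r)) with n = sum + 2,
p = 4, q = 8, r = 16. The first_shift_rounding_bit added above supplies the 2
and RightShiftWithRounding_S16 supplies the 32.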
+
+template <int filter_index>
+__m128i HorizontalTaps8To16_2x2(const uint8_t* src, const ptrdiff_t src_stride,
+ const __m128i* const v_tap) {
+ const __m128i sum =
+ SumHorizontalTaps2x2<filter_index>(src, src_stride, v_tap);
+
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int num_taps, bool is_2d_vertical = false>
+LIBGAV1_ALWAYS_INLINE void SetupTaps(const __m128i* const filter,
+ __m128i* v_tap) {
+ if (num_taps == 8) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x0); // k1k0
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[2] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ v_tap[3] = _mm_shufflelo_epi16(*filter, 0xff); // k7k6
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ v_tap[3] = _mm_cvtepi8_epi16(v_tap[3]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ v_tap[3] = _mm_unpacklo_epi64(v_tap[3], v_tap[3]);
+ }
+ } else if (num_taps == 6) {
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x0); // k2k1
+ v_tap[1] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ v_tap[2] = _mm_shufflelo_epi16(adjusted_filter, 0xaa); // k6k5
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ v_tap[2] = _mm_cvtepi8_epi16(v_tap[2]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ v_tap[2] = _mm_unpacklo_epi64(v_tap[2], v_tap[2]);
+ }
+ } else if (num_taps == 4) {
+ v_tap[0] = _mm_shufflelo_epi16(*filter, 0x55); // k3k2
+ v_tap[1] = _mm_shufflelo_epi16(*filter, 0xaa); // k5k4
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ v_tap[1] = _mm_cvtepi8_epi16(v_tap[1]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ v_tap[1] = _mm_unpacklo_epi64(v_tap[1], v_tap[1]);
+ }
+ } else { // num_taps == 2
+ const __m128i adjusted_filter = _mm_srli_si128(*filter, 1);
+ v_tap[0] = _mm_shufflelo_epi16(adjusted_filter, 0x55); // k4k3
+ if (is_2d_vertical) {
+ v_tap[0] = _mm_cvtepi8_epi16(v_tap[0]);
+ } else {
+ v_tap[0] = _mm_unpacklo_epi64(v_tap[0], v_tap[0]);
+ }
+ }
+}
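The shuffle selectors read as pair broadcasts: after LoadLo8, 16-bit word i of
|*filter| holds the adjacent tap bytes (k[2i], k[2i+1]). _mm_shufflelo_epi16
with 0x55 (binary 01 01 01 01) replicates word 1 across the low four words and
_mm_unpacklo_epi64 mirrors it into the high half, so _mm_maddubs_epi16 later
multiplies every pixel pair by the same signed tap pair. The one-byte
_mm_srli_si128 in the 6- and 2-tap cases drops the leading zero tap so the
pairs realign.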
+
+template <int num_taps, bool is_compound>
+__m128i SimpleSum2DVerticalTaps(const __m128i* const src,
+ const __m128i* const taps) {
+ __m128i sum_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[0], src[1]), taps[0]);
+ __m128i sum_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[0], src[1]), taps[0]);
+ if (num_taps >= 4) {
+ __m128i madd_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(src[2], src[3]), taps[1]);
+ __m128i madd_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(src[2], src[3]), taps[1]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps >= 6) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[4], src[5]), taps[2]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[4], src[5]), taps[2]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ if (num_taps == 8) {
+ madd_lo = _mm_madd_epi16(_mm_unpacklo_epi16(src[6], src[7]), taps[3]);
+ madd_hi = _mm_madd_epi16(_mm_unpackhi_epi16(src[6], src[7]), taps[3]);
+ sum_lo = _mm_add_epi32(sum_lo, madd_lo);
+ sum_hi = _mm_add_epi32(sum_hi, madd_hi);
+ }
+ }
+ }
+
+ if (is_compound) {
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
+ RightShiftWithRounding_S32(sum_hi,
+ kInterRoundBitsCompoundVertical - 1));
+ }
+
+ return _mm_packs_epi32(
+ RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
+ RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
+}
+
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int width,
+ const int height, const __m128i* const taps) {
+ assert(width >= 8);
+ constexpr int next_row = num_taps - 1;
+ // The Horizontal pass uses |width| as |stride| for the intermediate buffer.
+ const ptrdiff_t src_stride = width;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ int x = 0;
+ do {
+ __m128i srcs[8];
+ const uint16_t* src_x = src + x;
+ srcs[0] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 4) {
+ srcs[1] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[2] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps >= 6) {
+ srcs[3] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[4] = LoadAligned16(src_x);
+ src_x += src_stride;
+ if (num_taps == 8) {
+ srcs[5] = LoadAligned16(src_x);
+ src_x += src_stride;
+ srcs[6] = LoadAligned16(src_x);
+ src_x += src_stride;
+ }
+ }
+ }
+
+ auto* dst8_x = dst8 + x;
+ auto* dst16_x = dst16 + x;
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src_x);
+ src_x += src_stride;
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16_x, sum);
+ dst16_x += dst_stride;
+ } else {
+ StoreLo8(dst8_x, _mm_packus_epi16(sum, sum));
+ dst8_x += dst_stride;
+ }
+
+ srcs[0] = srcs[1];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[2];
+ srcs[2] = srcs[3];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[4];
+ srcs[4] = srcs[5];
+ if (num_taps == 8) {
+ srcs[5] = srcs[6];
+ srcs[6] = srcs[7];
+ }
+ }
+ }
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
+
+// Take advantage of |src_stride| == |width| to process two rows at a time.
+template <int num_taps, bool is_compound = false>
+void Filter2DVertical4xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 4) {
+ srcs[2] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]);
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[3] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[2], 8), srcs[4]);
+ if (num_taps == 8) {
+ srcs[6] = LoadAligned16(src);
+ src += 8;
+ srcs[5] = _mm_unpacklo_epi64(_mm_srli_si128(srcs[4], 8), srcs[6]);
+ }
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[num_taps] = LoadAligned16(src);
+ src += 8;
+ srcs[num_taps - 1] = _mm_unpacklo_epi64(
+ _mm_srli_si128(srcs[num_taps - 2], 8), srcs[num_taps]);
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, is_compound>(srcs, taps);
+ if (is_compound) {
+ StoreUnaligned16(dst16, sum);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results = _mm_packus_epi16(sum, sum);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ if (num_taps >= 4) {
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ if (num_taps >= 6) {
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ if (num_taps == 8) {
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ }
+ }
+ }
+ y -= 2;
+ } while (y != 0);
+}
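The even/odd srcs[] handling relies on the intermediate layout: with the
stride equal to the width (4), one aligned 16-byte load brings two consecutive
rows of four uint16 values, so srcs[0] holds rows (0, 1), srcs[2] holds rows
(2, 3), and the in-between srcs[1] = rows (1, 2) is spliced from the high half
of srcs[0] and the low half of srcs[2] by
_mm_unpacklo_epi64(_mm_srli_si128(srcs[0], 8), srcs[2]).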
+
+// Take advantage of |src_stride| == |width| to process four rows at a time.
+template <int num_taps>
+void Filter2DVertical2xH(const uint16_t* src, void* const dst,
+ const ptrdiff_t dst_stride, const int height,
+ const __m128i* const taps) {
+ constexpr int next_row = (num_taps < 6) ? 4 : 8;
+
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+ srcs[0] = LoadAligned16(src);
+ src += 8;
+ if (num_taps >= 6) {
+ srcs[4] = LoadAligned16(src);
+ src += 8;
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ if (num_taps == 8) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ }
+ }
+
+ int y = height;
+ do {
+ srcs[next_row] = LoadAligned16(src);
+ src += 8;
+ if (num_taps == 2) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ } else if (num_taps == 4) {
+ srcs[1] = _mm_alignr_epi8(srcs[4], srcs[0], 4);
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ } else if (num_taps == 6) {
+ srcs[2] = _mm_alignr_epi8(srcs[4], srcs[0], 8);
+ srcs[3] = _mm_alignr_epi8(srcs[4], srcs[0], 12);
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ } else if (num_taps == 8) {
+ srcs[5] = _mm_alignr_epi8(srcs[8], srcs[4], 4);
+ srcs[6] = _mm_alignr_epi8(srcs[8], srcs[4], 8);
+ srcs[7] = _mm_alignr_epi8(srcs[8], srcs[4], 12);
+ }
+
+ const __m128i sum =
+ SimpleSum2DVerticalTaps<num_taps, /*is_compound=*/false>(srcs, taps);
+ const __m128i results = _mm_packus_epi16(sum, sum);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+    // When |height| <= 4 the taps are restricted to the 2 and 4 tap variants.
+    // Therefore the |height| == 2 check is only needed for those tap counts.
+ if (num_taps <= 4 && height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ if (num_taps == 6) {
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ } else if (num_taps == 8) {
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ }
+
+ y -= 4;
+ } while (y != 0);
+}
+
+// The 1D compound shift is always |kInterRoundBitsHorizontal|, even for 1D
+// Vertical calculations.
+__m128i Compound1DShift(const __m128i sum) {
+ return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
+}
+
+template <int filter_index>
+__m128i SumVerticalTaps(const __m128i* const srcs, const __m128i* const v_tap) {
+ __m128i v_src[4];
+
+ if (filter_index < 2) {
+ // 6 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ } else if (filter_index == 2) {
+ // 8 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ v_src[2] = _mm_unpacklo_epi8(srcs[4], srcs[5]);
+ v_src[3] = _mm_unpacklo_epi8(srcs[6], srcs[7]);
+ } else if (filter_index == 3) {
+ // 2 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ } else if (filter_index > 3) {
+ // 4 taps.
+ v_src[0] = _mm_unpacklo_epi8(srcs[0], srcs[1]);
+ v_src[1] = _mm_unpacklo_epi8(srcs[2], srcs[3]);
+ }
+ const __m128i sum = SumOnePassTaps<filter_index>(v_src, v_tap);
+ return sum;
+}
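The _mm_unpacklo_epi8 interleaves pair vertically adjacent rows byte by byte,
so each 16-bit lane handed to SumOnePassTaps holds (row n pixel, row n + 1
pixel) against the matching (k_n, k_n+1) tap pair, and a single
_mm_maddubs_epi16 covers two taps of the vertical filter at once.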
+
+// TODO(slavarnway): Use num_taps instead of filter_index for templates. See the
+// 2D version.
+template <int num_taps, int filter_index, bool is_compound = false>
+void FilterVertical4xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+ auto* dst16 = static_cast<uint16_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+
+ int y = height;
+ do {
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ srcs[6] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+
+ int y = height;
+ do {
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ y -= 2;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ srcs[8] = _mm_setzero_si128();
+ // 00 01 02 03
+ srcs[0] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13
+ const __m128i a = Load4(src);
+ // 00 01 02 03 10 11 12 13
+ srcs[0] = _mm_unpacklo_epi32(srcs[0], a);
+ src += src_stride;
+ // 20 21 22 23
+ srcs[2] = Load4(src);
+ src += src_stride;
+ // 10 11 12 13 20 21 22 23
+ srcs[1] = _mm_unpacklo_epi32(a, srcs[2]);
+ // 30 31 32 33
+ const __m128i b = Load4(src);
+ // 20 21 22 23 30 31 32 33
+ srcs[2] = _mm_unpacklo_epi32(srcs[2], b);
+ src += src_stride;
+ // 40 41 42 43
+ srcs[4] = Load4(src);
+ src += src_stride;
+ // 30 31 32 33 40 41 42 43
+ srcs[3] = _mm_unpacklo_epi32(b, srcs[4]);
+ // 50 51 52 53
+ const __m128i c = Load4(src);
+ // 40 41 42 43 50 51 52 53
+ srcs[4] = _mm_unpacklo_epi32(srcs[4], c);
+ src += src_stride;
+ // 60 61 62 63
+ srcs[6] = Load4(src);
+ src += src_stride;
+ // 50 51 52 53 60 61 62 63
+ srcs[5] = _mm_unpacklo_epi32(c, srcs[6]);
+
+ int y = height;
+ do {
+ // 70 71 72 73
+ const __m128i d = Load4(src);
+ // 60 61 62 63 70 71 72 73
+ srcs[6] = _mm_unpacklo_epi32(srcs[6], d);
+ src += src_stride;
+ // 80 81 82 83
+ srcs[8] = Load4(src);
+ src += src_stride;
+ // 70 71 72 73 80 81 82 83
+ srcs[7] = _mm_unpacklo_epi32(d, srcs[8]);
+
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ if (is_compound) {
+ const __m128i results = Compound1DShift(sums);
+ StoreUnaligned16(dst16, results);
+ dst16 += 4 << 1;
+ } else {
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+ Store4(dst8, results);
+ dst8 += dst_stride;
+ Store4(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ }
+
+ srcs[0] = srcs[2];
+ srcs[1] = srcs[3];
+ srcs[2] = srcs[4];
+ srcs[3] = srcs[5];
+ srcs[4] = srcs[6];
+ srcs[5] = srcs[7];
+ srcs[6] = srcs[8];
+ y -= 2;
+ } while (y != 0);
+ }
+}
+
+template <int num_taps, int filter_index, bool negative_outside_taps = false>
+void FilterVertical2xH(const uint8_t* src, const ptrdiff_t src_stride,
+ void* const dst, const ptrdiff_t dst_stride,
+ const int height, const __m128i* const v_tap) {
+ auto* dst8 = static_cast<uint8_t*>(dst);
+
+ __m128i srcs[9];
+
+ if (num_taps == 2) {
+ srcs[2] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[2] = Load2<0>(src, srcs[2]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_2 = _mm_unpacklo_epi64(srcs[0], srcs[2]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_2, 2);
+ // This uses srcs[0]..srcs[1].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[2];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 4) {
+ srcs[4] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+
+ int y = height;
+ do {
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2<0>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ // This uses srcs[0]..srcs[3].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ if (height == 2) return;
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 6) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+    // 00 01 10 11 20 21 30 31 40 41
+ const __m128i srcs_0_4x = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4x, 2);
+
+ int y = height;
+ do {
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+      // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+
+ // This uses srcs[0]..srcs[5].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ } else if (num_taps == 8) {
+ // During the vertical pass the number of taps is restricted when
+ // |height| <= 4.
+ assert(height > 4);
+ srcs[8] = _mm_setzero_si128();
+ // 00 01
+ srcs[0] = Load2(src);
+ src += src_stride;
+ // 00 01 10 11
+ srcs[0] = Load2<1>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21
+ srcs[0] = Load2<2>(src, srcs[0]);
+ src += src_stride;
+ // 00 01 10 11 20 21 30 31
+ srcs[0] = Load2<3>(src, srcs[0]);
+ src += src_stride;
+ // 40 41
+ srcs[4] = Load2(src);
+ src += src_stride;
+ // 40 41 50 51
+ srcs[4] = Load2<1>(src, srcs[4]);
+ src += src_stride;
+ // 40 41 50 51 60 61
+ srcs[4] = Load2<2>(src, srcs[4]);
+ src += src_stride;
+
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61
+ const __m128i srcs_0_4 = _mm_unpacklo_epi64(srcs[0], srcs[4]);
+ // 10 11 20 21 30 31 40 41
+ srcs[1] = _mm_srli_si128(srcs_0_4, 2);
+ // 20 21 30 31 40 41 50 51
+ srcs[2] = _mm_srli_si128(srcs_0_4, 4);
+ // 30 31 40 41 50 51 60 61
+ srcs[3] = _mm_srli_si128(srcs_0_4, 6);
+
+ int y = height;
+ do {
+ // 40 41 50 51 60 61 70 71
+ srcs[4] = Load2<3>(src, srcs[4]);
+ src += src_stride;
+ // 80 81
+ srcs[8] = Load2<0>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91
+ srcs[8] = Load2<1>(src, srcs[8]);
+ src += src_stride;
+ // 80 81 90 91 a0 a1
+ srcs[8] = Load2<2>(src, srcs[8]);
+ src += src_stride;
+
+ // 40 41 50 51 60 61 70 71 80 81 90 91 a0 a1
+ const __m128i srcs_4_8 = _mm_unpacklo_epi64(srcs[4], srcs[8]);
+ // 50 51 60 61 70 71 80 81
+ srcs[5] = _mm_srli_si128(srcs_4_8, 2);
+ // 60 61 70 71 80 81 90 91
+ srcs[6] = _mm_srli_si128(srcs_4_8, 4);
+ // 70 71 80 81 90 91 a0 a1
+ srcs[7] = _mm_srli_si128(srcs_4_8, 6);
+
+ // This uses srcs[0]..srcs[7].
+ const __m128i sums = SumVerticalTaps<filter_index>(srcs, v_tap);
+ const __m128i results_16 =
+ RightShiftWithRounding_S16(sums, kFilterBits - 1);
+ const __m128i results = _mm_packus_epi16(results_16, results_16);
+
+ Store2(dst8, results);
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 2));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 4));
+ dst8 += dst_stride;
+ Store2(dst8, _mm_srli_si128(results, 6));
+ dst8 += dst_stride;
+
+ srcs[0] = srcs[4];
+ srcs[1] = srcs[5];
+ srcs[2] = srcs[6];
+ srcs[3] = srcs[7];
+ srcs[4] = srcs[8];
+ y -= 4;
+ } while (y != 0);
+ }
+}
diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
index 77517ee..3c29b19 100644
--- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/distance_weighted_blend.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
@@ -30,6 +30,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
constexpr int kInterPostRoundBit = 4;
@@ -212,13 +213,231 @@
}
} // namespace
+} // namespace low_bitdepth
-void DistanceWeightedBlendInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kInterPostRoundBit = 4;
+
+inline __m128i ComputeWeightedAverage8(const __m128i& pred0,
+ const __m128i& pred1,
+ const __m128i& weight0,
+ const __m128i& weight1) {
+ // This offset is a combination of round_factor and round_offset
+ // which are to be added and subtracted respectively.
+ // Here kInterPostRoundBit + 4 is considering bitdepth=10.
+ constexpr int offset =
+ (1 << ((kInterPostRoundBit + 4) - 1)) - (kCompoundOffset << 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bias = _mm_set1_epi32(offset);
+ const __m128i clip_high = _mm_set1_epi16(kMax10bppSample);
+
+ __m128i prediction0 = _mm_cvtepu16_epi32(pred0);
+ __m128i mult0 = _mm_mullo_epi32(prediction0, weight0);
+ __m128i prediction1 = _mm_cvtepu16_epi32(pred1);
+ __m128i mult1 = _mm_mullo_epi32(prediction1, weight1);
+ __m128i sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result0 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+
+ prediction0 = _mm_unpackhi_epi16(pred0, zero);
+ mult0 = _mm_mullo_epi32(prediction0, weight0);
+ prediction1 = _mm_unpackhi_epi16(pred1, zero);
+ mult1 = _mm_mullo_epi32(prediction1, weight1);
+ sum = _mm_add_epi32(mult0, mult1);
+ sum = _mm_add_epi32(sum, bias);
+ const __m128i result1 = _mm_srai_epi32(sum, kInterPostRoundBit + 4);
+ const __m128i pack = _mm_packus_epi32(result0, result1);
+
+ return _mm_min_epi16(pack, clip_high);
+}
+
+template <int height>
+inline void DistanceWeightedBlend4xH_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadLo8(pred_0);
+ const __m128i src_10 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ __m128i src_0 = LoadHi8(src_00, pred_0);
+ __m128i src_1 = LoadHi8(src_10, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+ const __m128i src_01 = LoadLo8(pred_0);
+ const __m128i src_11 = LoadLo8(pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ src_0 = LoadHi8(src_01, pred_0);
+ src_1 = LoadHi8(src_11, pred_1);
+ pred_0 += 4;
+ pred_1 += 4;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_0, src_1, weight0, weight1);
+
+ StoreLo8(dst, res0);
+ dst += dest_stride;
+ StoreHi8(dst, res0);
+ dst += dest_stride;
+ StoreLo8(dst, res1);
+ dst += dest_stride;
+ StoreHi8(dst, res1);
+ dst += dest_stride;
+ y -= 4;
+ } while (y != 0);
+}
+
+template <int height>
+inline void DistanceWeightedBlend8xH_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, void* const dest, const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ const __m128i src_00 = LoadAligned16(pred_0);
+ const __m128i src_10 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res0 =
+ ComputeWeightedAverage8(src_00, src_10, weight0, weight1);
+
+ const __m128i src_01 = LoadAligned16(pred_0);
+ const __m128i src_11 = LoadAligned16(pred_1);
+ pred_0 += 8;
+ pred_1 += 8;
+ const __m128i res1 =
+ ComputeWeightedAverage8(src_01, src_11, weight0, weight1);
+
+ StoreUnaligned16(dst, res0);
+ dst += dest_stride;
+ StoreUnaligned16(dst, res1);
+ dst += dest_stride;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void DistanceWeightedBlendLarge_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1, const uint8_t weight_0,
+ const uint8_t weight_1, const int width, const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i weight0 = _mm_set1_epi32(weight_0);
+ const __m128i weight1 = _mm_set1_epi32(weight_1);
+
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i src_0_lo = LoadAligned16(pred_0 + x);
+ const __m128i src_1_lo = LoadAligned16(pred_1 + x);
+ const __m128i res_lo =
+ ComputeWeightedAverage8(src_0_lo, src_1_lo, weight0, weight1);
+
+ const __m128i src_0_hi = LoadAligned16(pred_0 + x + 8);
+ const __m128i src_1_hi = LoadAligned16(pred_1 + x + 8);
+ const __m128i res_hi =
+ ComputeWeightedAverage8(src_0_hi, src_1_hi, weight0, weight1);
+
+ StoreUnaligned16(dst + x, res_lo);
+ x += 8;
+ StoreUnaligned16(dst + x, res_hi);
+ x += 8;
+ } while (x < width);
+ dst += dest_stride;
+ pred_0 += width;
+ pred_1 += width;
+ } while (--y != 0);
+}
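+
+// Note on the loops above: the prediction buffers are packed at the block
+// width (|pred_0| and |pred_1| advance by |width| per row), while |dst|
+// advances by |dest_stride|; only the destination is a strided frame buffer.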
+
+void DistanceWeightedBlend_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const uint8_t weight_0,
+ const uint8_t weight_1, const int width,
+ const int height, void* const dest,
+ const ptrdiff_t dest_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(*pred_0);
+ if (width == 4) {
+ if (height == 4) {
+ DistanceWeightedBlend4xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else if (height == 8) {
+ DistanceWeightedBlend4xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ } else {
+ assert(height == 16);
+ DistanceWeightedBlend4xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ }
+ return;
+ }
+
+ if (width == 8) {
+ switch (height) {
+ case 4:
+ DistanceWeightedBlend8xH_SSE4_1<4>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 8:
+ DistanceWeightedBlend8xH_SSE4_1<8>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ case 16:
+ DistanceWeightedBlend8xH_SSE4_1<16>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+ return;
+ default:
+ assert(height == 32);
+ DistanceWeightedBlend8xH_SSE4_1<32>(pred_0, pred_1, weight_0, weight_1,
+ dest, dst_stride);
+
+ return;
+ }
+ }
+
+ DistanceWeightedBlendLarge_SSE4_1(pred_0, pred_1, weight_0, weight_1, width,
+ height, dest, dst_stride);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(DistanceWeightedBlend)
+ dsp->distance_weighted_blend = DistanceWeightedBlend_SSE4_1;
+#endif
+}
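+
+// DSP_ENABLED_10BPP_SSE4_1(X) is expected to compare LIBGAV1_Dsp10bpp_X
+// against LIBGAV1_CPU_SSE4_1, so the pointer above is only installed when the
+// header selected the SSE4.1 implementation for this bitdepth.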
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void DistanceWeightedBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -227,4 +446,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
index 2831ded..dbb9f88 100644
--- a/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
+++ b/libgav1/src/dsp/x86/distance_weighted_blend_sse4.h
@@ -31,11 +31,15 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_DistanceWeightedBlend
#define LIBGAV1_Dsp8bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_DistanceWeightedBlend
+#define LIBGAV1_Dsp10bpp_DistanceWeightedBlend LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_DISTANCE_WEIGHTED_BLEND_SSE4_H_
diff --git a/libgav1/src/dsp/x86/film_grain_sse4.cc b/libgav1/src/dsp/x86/film_grain_sse4.cc
new file mode 100644
index 0000000..745c1ca
--- /dev/null
+++ b/libgav1/src/dsp/x86/film_grain_sse4.cc
@@ -0,0 +1,514 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/film_grain.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+#include <smmintrin.h>
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/film_grain_common.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/compiler_attributes.h"
+#include "src/utils/logging.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace film_grain {
+namespace {
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+// The function is overloaded for each type and bitdepth for simplicity.
+inline __m128i LoadSource(const int8_t* src) {
+ return _mm_cvtepi8_epi16(LoadLo8(src));
+}
+
+// Load 8 values from source, widening to int16_t intermediate value size.
+inline __m128i LoadSource(const uint8_t* src) {
+ return _mm_cvtepu8_epi16(LoadLo8(src));
+}
+
+inline __m128i LoadSourceMsan(const uint8_t* src, const int valid_range) {
+ return _mm_cvtepu8_epi16(LoadLo8Msan(src, 8 - valid_range));
+}
+
+// Store 8 values to dest, narrowing to uint8_t from int16_t intermediate value.
+inline void StoreUnsigned(uint8_t* dest, const __m128i data) {
+ StoreLo8(dest, _mm_packus_epi16(data, data));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// Load 8 values from source.
+inline __m128i LoadSource(const int16_t* src) { return LoadUnaligned16(src); }
+
+// Load 8 values from source.
+inline __m128i LoadSource(const uint16_t* src) { return LoadUnaligned16(src); }
+
+// Store 8 values to dest.
+inline void StoreUnsigned(uint16_t* dest, const __m128i data) {
+ StoreUnaligned16(dest, data);
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint8_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16(luma);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8(luma));
+}
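+
+// In the subsampled path above, the 16 loaded bytes are 8 horizontal luma
+// pairs; _mm_cvtepu8_epi16 widens the low half and _mm_unpackhi_epi8 the high
+// half, _mm_hadd_epi16 then sums each adjacent pair, and the rounded shift by
+// 1 turns each pair sum into its average.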
+
+inline __m128i GetAverageLumaMsan(const uint8_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ const __m128i src = LoadUnaligned16Msan(luma, 16 - valid_range);
+
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(_mm_cvtepu8_epi16(src),
+ _mm_unpackhi_epi8(src, _mm_setzero_si128())),
+ 1);
+ }
+ return _mm_cvtepu8_epi16(LoadLo8Msan(luma, 8 - valid_range));
+}
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+// For BlendNoiseWithImageChromaWithCfl, only |subsampling_x| is needed.
+inline __m128i GetAverageLuma(const uint16_t* const luma, int subsampling_x) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(LoadUnaligned16(luma), LoadUnaligned16(luma + 8)), 1);
+ }
+ return LoadUnaligned16(luma);
+}
+
+inline __m128i GetAverageLumaMsan(const uint16_t* const luma, int subsampling_x,
+ int valid_range) {
+ if (subsampling_x != 0) {
+ return RightShiftWithRounding_U16(
+ _mm_hadd_epi16(
+ LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma)),
+ LoadUnaligned16Msan(luma + 8, 32 - valid_range * sizeof(*luma))),
+ 1);
+ }
+ return LoadUnaligned16Msan(luma, 16 - valid_range * sizeof(*luma));
+}
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+inline __m128i Clip3(const __m128i value, const __m128i low,
+ const __m128i high) {
+ const __m128i clipped_to_ceiling = _mm_min_epi16(high, value);
+ return _mm_max_epi16(low, clipped_to_ceiling);
+}
+
+template <int bitdepth, typename Pixel>
+inline __m128i GetScalingFactors(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* source) {
+ alignas(16) int16_t start_vals[8];
+ if (bitdepth == 8) {
+ // TODO(petersonab): Speed this up by creating a uint16_t scaling_lut.
+ // Currently this code results in a series of movzbl.
+ for (int i = 0; i < 8; ++i) {
+ start_vals[i] = scaling_lut[source[i]];
+ }
+ return LoadAligned16(start_vals);
+ }
+ alignas(16) int16_t end_vals[8];
+ // TODO(petersonab): Precompute this into a larger table for direct lookups.
+ for (int i = 0; i < 8; ++i) {
+ const int index = source[i] >> 2;
+ start_vals[i] = scaling_lut[index];
+ end_vals[i] = scaling_lut[index + 1];
+ }
+ const __m128i start = LoadAligned16(start_vals);
+ const __m128i end = LoadAligned16(end_vals);
+ __m128i remainder = LoadSource(source);
+ remainder = _mm_srli_epi16(_mm_slli_epi16(remainder, 14), 1);
+ const __m128i delta = _mm_mulhrs_epi16(_mm_sub_epi16(end, start), remainder);
+ return _mm_add_epi16(start, delta);
+}
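+
+// In the bitdepth > 8 path above, |index| keeps the upper bits of the sample
+// and |remainder| places its low 2 bits at bits 13..14, so _mm_mulhrs_epi16
+// evaluates ((end - start) * frac * (1 << 13) + (1 << 14)) >> 15, which is
+// RightShiftWithRounding((end - start) * frac, 2): linear interpolation
+// between adjacent table entries in quarter steps.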
+
+// |scaling_shift| is in range [8,11].
+template <int bitdepth>
+inline __m128i ScaleNoise(const __m128i noise, const __m128i scaling,
+ const __m128i scaling_shift) {
+ const __m128i shifted_scale_factors = _mm_sll_epi16(scaling, scaling_shift);
+ return _mm_mulhrs_epi16(noise, shifted_scale_factors);
+}
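+
+// Callers pass |scaling_shift| pre-converted to 15 - shift (see
+// |derived_scaling_shift| below). With shift = 8, for example, the factors
+// are shifted left by 7 and _mm_mulhrs_epi16's (x * y + (1 << 14)) >> 15
+// yields RightShiftWithRounding(noise * scaling, 8) without widening to 32
+// bits.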
+
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageLuma_SSE4_1(
+ const void* noise_image_ptr, int min_value, int max_luma, int scaling_shift,
+ int width, int height, int start_height,
+ const uint8_t scaling_lut_y[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y, void* dest_plane_y,
+ ptrdiff_t dest_stride_y) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y_row = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+ auto* out_y_row = static_cast<Pixel*>(dest_plane_y);
+ dest_stride_y /= sizeof(Pixel);
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_luma);
+ const int safe_width = width & ~7;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_width; x += 8) {
+ // TODO(b/133525232): Make 16-pixel version of loop body.
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, &in_y_row[x]);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+
+ if (x < width) {
+ Pixel luma_buffer[8];
+ // Prevent arbitrary indices from entering GetScalingFactors.
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ const int valid_range = width - x;
+ memcpy(luma_buffer, &in_y_row[x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i orig = LoadSource(&in_y_row[x]);
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut_y, luma_buffer);
+ __m128i noise = LoadSource(&(noise_image[kPlaneY][y + start_height][x]));
+
+ noise = ScaleNoise<bitdepth>(noise, scaling, derived_scaling_shift);
+ const __m128i combined = _mm_add_epi16(orig, noise);
+ StoreUnsigned(&out_y_row[x], Clip3(combined, floor, ceiling));
+ }
+ in_y_row += source_stride_y;
+ out_y_row += dest_stride_y;
+  } while (++y < height);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+inline __m128i BlendChromaValsWithCfl(
+ const Pixel* average_luma_buffer,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const Pixel* chroma_cursor, const GrainType* noise_image_cursor,
+ const __m128i scaling_shift) {
+ const __m128i scaling =
+ GetScalingFactors<bitdepth, Pixel>(scaling_lut, average_luma_buffer);
+ const __m128i orig = LoadSource(chroma_cursor);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<bitdepth>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
+
+template <int bitdepth, typename GrainType, typename Pixel>
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlaneWithCfl_SSE4_1(
+ const Array2D<GrainType>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const Pixel* in_y_row,
+ ptrdiff_t source_stride_y, const Pixel* in_chroma_row,
+ ptrdiff_t source_stride_chroma, Pixel* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+ alignas(16) Pixel luma_buffer[16];
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final pixel will
+ // need to be guarded from overread, even if |chroma_width| is divisible by 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+
+ // Writing to this buffer avoids the cost of doing 8 lane lookups in a row
+ // in GetScalingFactors.
+ Pixel average_luma_buffer[8];
+ assert(start_height % 2 == 0);
+ start_height >>= subsampling_y;
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ // TODO(petersonab): Consider specializing by subsampling_x. In the 444
+ // case &in_y_row[x] can be passed to GetScalingFactors directly.
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+    // This section only runs if width % (8 << subsampling_x) != 0. It should
+    // never run on 720p and above.
+ if (x < chroma_width) {
+ // Prevent huge indices from entering GetScalingFactors due to
+ // uninitialized values. This is not a problem in 8bpp because the table
+ // is made larger than 255 values.
+ if (bitdepth > 8) {
+ memset(luma_buffer, 0, sizeof(luma_buffer));
+ }
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ StoreUnsigned(average_luma_buffer, average_luma);
+
+ const __m128i blended =
+ BlendChromaValsWithCfl<bitdepth, GrainType, Pixel>(
+ average_luma_buffer, scaling_lut, &in_chroma_row[x],
+ &(noise_image[y + start_height][x]), derived_scaling_shift);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == true.
+// This further implies that scaling_lut_u == scaling_lut_v == scaling_lut_y.
+template <int bitdepth, typename GrainType, typename Pixel>
+void BlendNoiseWithImageChromaWithCfl_SSE4_1(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ const auto* noise_image =
+ static_cast<const Array2D<GrainType>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const Pixel*>(source_plane_y);
+ source_stride_y /= sizeof(Pixel);
+
+ const auto* in_uv = static_cast<const Pixel*>(source_plane_uv);
+ source_stride_uv /= sizeof(Pixel);
+ auto* out_uv = static_cast<Pixel*>(dest_plane_uv);
+ dest_stride_uv /= sizeof(Pixel);
+ BlendChromaPlaneWithCfl_SSE4_1<bitdepth, GrainType, Pixel>(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, scaling_lut, in_y,
+ source_stride_y, in_uv, source_stride_uv, out_uv, dest_stride_uv);
+}
+
+} // namespace
+
+namespace low_bitdepth {
+namespace {
+
+// |offset| is a 16-bit chroma offset, added after the _mm_madd_epi16 results
+// have been shifted and packed back down to 16 bits.
+inline __m128i BlendChromaValsNoCfl8bpp(
+ const uint8_t scaling_lut[kScalingLookupTableSize], const __m128i& orig,
+ const int8_t* noise_image_cursor, const __m128i& average_luma,
+ const __m128i& scaling_shift, const __m128i& offset,
+ const __m128i& weights) {
+ uint8_t merged_buffer[8];
+ const __m128i combined_lo =
+ _mm_madd_epi16(_mm_unpacklo_epi16(average_luma, orig), weights);
+ const __m128i combined_hi =
+ _mm_madd_epi16(_mm_unpackhi_epi16(average_luma, orig), weights);
+ const __m128i merged_base = _mm_packs_epi32(_mm_srai_epi32((combined_lo), 6),
+ _mm_srai_epi32((combined_hi), 6));
+
+ const __m128i merged = _mm_add_epi16(merged_base, offset);
+
+ StoreLo8(merged_buffer, _mm_packus_epi16(merged, merged));
+ const __m128i scaling =
+ GetScalingFactors<8, uint8_t>(scaling_lut, merged_buffer);
+ __m128i noise = LoadSource(noise_image_cursor);
+ noise = ScaleNoise<8>(noise, scaling, scaling_shift);
+ return _mm_add_epi16(orig, noise);
+}
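+
+// The interleave/madd pair above computes, per pixel,
+//   merged = ((average_luma * luma_multiplier +
+//              orig * chroma_multiplier) >> 6) + chroma_offset
+// since |weights| packs luma_multiplier in the low and chroma_multiplier in
+// the high 16 bits of each 32-bit lane; |merged| is then the index used to
+// look up the chroma scaling factor.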
+
+LIBGAV1_ALWAYS_INLINE void BlendChromaPlane8bpp_SSE4_1(
+ const Array2D<int8_t>& noise_image, int min_value, int max_chroma,
+ int width, int height, int start_height, int subsampling_x,
+ int subsampling_y, int scaling_shift, int chroma_offset,
+ int chroma_multiplier, int luma_multiplier,
+ const uint8_t scaling_lut[kScalingLookupTableSize], const uint8_t* in_y_row,
+ ptrdiff_t source_stride_y, const uint8_t* in_chroma_row,
+ ptrdiff_t source_stride_chroma, uint8_t* out_chroma_row,
+ ptrdiff_t dest_stride) {
+ const __m128i floor = _mm_set1_epi16(min_value);
+ const __m128i ceiling = _mm_set1_epi16(max_chroma);
+
+ const int chroma_height = (height + subsampling_y) >> subsampling_y;
+ const int chroma_width = (width + subsampling_x) >> subsampling_x;
+ // |chroma_width| is rounded up. If |width| is odd, then the final luma pixel
+ // will need to be guarded from overread, even if |chroma_width| is a
+ // multiple of 8.
+ const int safe_chroma_width = (chroma_width - (width & 1)) & ~7;
+ alignas(16) uint8_t luma_buffer[16];
+ const __m128i offset = _mm_set1_epi16(chroma_offset);
+ const __m128i multipliers = _mm_set1_epi32(LeftShift(chroma_multiplier, 16) |
+ (luma_multiplier & 0xFFFF));
+ const __m128i derived_scaling_shift = _mm_cvtsi32_si128(15 - scaling_shift);
+
+ start_height >>= subsampling_y;
+ int y = 0;
+ do {
+ int x = 0;
+ for (; x < safe_chroma_width; x += 8) {
+ const int luma_x = x << subsampling_x;
+ const __m128i average_luma =
+ GetAverageLuma(&in_y_row[luma_x], subsampling_x);
+ const __m128i orig_chroma = LoadSource(&in_chroma_row[x]);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ }
+
+ if (x < chroma_width) {
+ // Begin right edge iteration. Same as the normal iterations, but the
+ // |average_luma| computation requires a duplicated luma value at the
+ // end.
+ const int luma_x = x << subsampling_x;
+ const int valid_range = width - luma_x;
+ assert(valid_range < 16);
+ // There is no need to pre-initialize this buffer, because merged values
+ // used as indices are saturated in the 8bpp case. Uninitialized values
+ // are written outside the frame.
+ memcpy(luma_buffer, &in_y_row[luma_x], valid_range * sizeof(in_y_row[0]));
+ luma_buffer[valid_range] = in_y_row[width - 1];
+ const int valid_range_chroma = chroma_width - x;
+ uint8_t chroma_buffer[8];
+ memcpy(chroma_buffer, &in_chroma_row[x],
+ valid_range_chroma * sizeof(in_chroma_row[0]));
+
+ const __m128i average_luma =
+ GetAverageLumaMsan(luma_buffer, subsampling_x, valid_range + 1);
+ const __m128i orig_chroma =
+ LoadSourceMsan(chroma_buffer, valid_range_chroma);
+ const __m128i blended = BlendChromaValsNoCfl8bpp(
+ scaling_lut, orig_chroma, &(noise_image[y + start_height][x]),
+ average_luma, derived_scaling_shift, offset, multipliers);
+ StoreUnsigned(&out_chroma_row[x], Clip3(blended, floor, ceiling));
+ // End of right edge iteration.
+ }
+
+ in_y_row += source_stride_y << subsampling_y;
+ in_chroma_row += source_stride_chroma;
+ out_chroma_row += dest_stride;
+ } while (++y < chroma_height);
+}
+
+// This function is for the case params_.chroma_scaling_from_luma == false.
+void BlendNoiseWithImageChroma8bpp_SSE4_1(
+ Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
+ int min_value, int max_chroma, int width, int height, int start_height,
+ int subsampling_x, int subsampling_y,
+ const uint8_t scaling_lut[kScalingLookupTableSize],
+ const void* source_plane_y, ptrdiff_t source_stride_y,
+ const void* source_plane_uv, ptrdiff_t source_stride_uv,
+ void* dest_plane_uv, ptrdiff_t dest_stride_uv) {
+ assert(plane == kPlaneU || plane == kPlaneV);
+ const auto* noise_image =
+ static_cast<const Array2D<int8_t>*>(noise_image_ptr);
+ const auto* in_y = static_cast<const uint8_t*>(source_plane_y);
+ const auto* in_uv = static_cast<const uint8_t*>(source_plane_uv);
+ auto* out_uv = static_cast<uint8_t*>(dest_plane_uv);
+
+ const int offset = (plane == kPlaneU) ? params.u_offset : params.v_offset;
+ const int luma_multiplier =
+ (plane == kPlaneU) ? params.u_luma_multiplier : params.v_luma_multiplier;
+ const int multiplier =
+ (plane == kPlaneU) ? params.u_multiplier : params.v_multiplier;
+ BlendChromaPlane8bpp_SSE4_1(
+ noise_image[plane], min_value, max_chroma, width, height, start_height,
+ subsampling_x, subsampling_y, params.chroma_scaling, offset, multiplier,
+ luma_multiplier, scaling_lut, in_y, source_stride_y, in_uv,
+ source_stride_uv, out_uv, dest_stride_uv);
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<8, int8_t, uint8_t>;
+ dsp->film_grain.blend_noise_chroma[0] = BlendNoiseWithImageChroma8bpp_SSE4_1;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<8, int8_t, uint8_t>;
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+ dsp->film_grain.blend_noise_luma =
+ BlendNoiseWithImageLuma_SSE4_1<10, int16_t, uint16_t>;
+ dsp->film_grain.blend_noise_chroma[1] =
+ BlendNoiseWithImageChromaWithCfl_SSE4_1<10, int16_t, uint16_t>;
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+} // namespace film_grain
+
+void FilmGrainInit_SSE4_1() {
+ film_grain::low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ film_grain::high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+
+namespace libgav1 {
+namespace dsp {
+
+void FilmGrainInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/film_grain_sse4.h b/libgav1/src/dsp/x86/film_grain_sse4.h
new file mode 100644
index 0000000..1cacbac
--- /dev/null
+++ b/libgav1/src/dsp/x86/film_grain_sse4.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initialize members of Dsp::film_grain. This function is not thread-safe.
+void FilmGrainInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+#if LIBGAV1_TARGETING_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseLuma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChroma LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp8bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#define LIBGAV1_Dsp10bpp_FilmGrainBlendNoiseChromaWithCfl LIBGAV1_CPU_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_FILM_GRAIN_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intra_edge_sse4.cc b/libgav1/src/dsp/x86/intra_edge_sse4.cc
index 3635ee1..d6af907 100644
--- a/libgav1/src/dsp/x86/intra_edge_sse4.cc
+++ b/libgav1/src/dsp/x86/intra_edge_sse4.cc
@@ -15,14 +15,14 @@
#include "src/dsp/intra_edge.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
@@ -259,7 +259,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -267,4 +267,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intra_edge_sse4.h b/libgav1/src/dsp/x86/intra_edge_sse4.h
index d6c926e..6ed4d40 100644
--- a/libgav1/src/dsp/x86/intra_edge_sse4.h
+++ b/libgav1/src/dsp/x86/intra_edge_sse4.h
@@ -32,7 +32,7 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_IntraEdgeFilter
#define LIBGAV1_Dsp8bpp_IntraEdgeFilter LIBGAV1_CPU_SSE4_1
#endif
@@ -41,6 +41,6 @@
#define LIBGAV1_Dsp8bpp_IntraEdgeUpsampler LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_INTRA_EDGE_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc b/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
index ddf3a95..f2dcfdb 100644
--- a/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_cfl_sse4.cc
@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_cfl.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -29,9 +29,48 @@
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
+namespace {
+
+// This duplicates the last two 16-bit values in |row|.
+inline __m128i LastRowSamples(const __m128i row) {
+ return _mm_shuffle_epi32(row, 0xFF);
+}
+
+// This duplicates the last 16-bit value in |row|.
+inline __m128i LastRowResult(const __m128i row) {
+ const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
+ return _mm_shuffle_epi32(dup_row, 0xFF);
+}
+
+// Takes in two sums of input row pairs, and completes the computation for two
+// output rows.
+inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreLo8(luma_ptr, result);
+ StoreHi8(luma_ptr + kCflLumaBufferStride, result);
+ return result;
+}
+
+// Takes two halves of a vertically added pair of rows and completes the
+// computation for one output row.
+inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
+ const __m128i vertical_sum1,
+ int16_t* luma_ptr) {
+ __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
+ result = _mm_slli_epi16(result, 1);
+ StoreUnaligned16(luma_ptr, result);
+ return result;
+}
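+
+// In both helpers above, _mm_hadd_epi16 adds horizontally adjacent pairs of
+// the already vertically summed rows, producing 2x2 luma block sums (4x the
+// average); the shift left by 1 rescales them to the x8 fixed-point precision
+// stored in the CfL luma buffer.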
+
+} // namespace
+
namespace low_bitdepth {
namespace {
@@ -40,8 +79,8 @@
inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
__m128i alpha_sign, __m128i dc_q0) {
- __m128i ac_q3 = LoadUnaligned16(input);
- __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
__m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
return _mm_add_epi16(scaled_luma_q0, dc_q0);
@@ -88,8 +127,7 @@
template <int block_height_log2, bool is_inside>
void CflSubsampler444_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
- const int /*max_luma_width*/, const int max_luma_height,
- const void* const source, ptrdiff_t stride) {
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
static_assert(block_height_log2 <= 4, "");
const int block_height = 1 << block_height_log2;
const int visible_height = max_luma_height;
@@ -119,12 +157,15 @@
} while (y < visible_height);
if (!is_inside) {
- int y = visible_height;
+ // Replicate the 2 high lanes.
+ samples = _mm_shuffle_epi32(samples, 0xee);
do {
+ StoreLo8(luma_ptr, samples);
+ luma_ptr += kCflLumaBufferStride;
StoreHi8(luma_ptr, samples);
luma_ptr += kCflLumaBufferStride;
sum = _mm_add_epi16(sum, samples);
- ++y;
+ y += 2;
} while (y < block_height);
}
@@ -152,15 +193,15 @@
static_assert(block_height_log2 <= 4, "");
assert(max_luma_width >= 4);
assert(max_luma_height >= 4);
- const int block_height = 1 << block_height_log2;
- const int block_width = 4;
+ static_cast<void>(max_luma_width);
+ constexpr int block_height = 1 << block_height_log2;
- if (block_height <= max_luma_height && block_width <= max_luma_width) {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(
- luma, max_luma_width, max_luma_height, source, stride);
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
} else {
- CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(
- luma, max_luma_width, max_luma_height, source, stride);
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
}
}
@@ -302,19 +343,9 @@
__m128i inner_sum_lo, inner_sum_hi;
int y = 0;
do {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 0; x < 16; x++) {
- const int x_index = std::min(x, visible_width_16 - 1);
- c_arr[x] = src[x_index] << 3;
- }
- samples0 = LoadUnaligned16(c_arr);
- samples1 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_16);
-#else
- __m128i samples01 = LoadUnaligned16(src);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples01 = LoadUnaligned16Msan(src, invisible_width_16);
if (!inside) {
const __m128i border16 =
@@ -323,26 +354,15 @@
}
samples0 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples01), 3);
samples1 = _mm_slli_epi16(_mm_unpackhi_epi8(samples01, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr, samples0);
StoreUnaligned16(luma_ptr + 8, samples1);
__m128i inner_sum = _mm_add_epi16(samples0, samples1);
if (block_width == 32) {
-#if LIBGAV1_MSAN // We can load uninitialized values here. Even though they are
- // then masked off by blendv, MSAN isn't smart enough to
- // understand that. So we switch to a C implementation here.
- uint16_t c_arr[16];
- for (int x = 16; x < 32; x++) {
- const int x_index = std::min(x, visible_width_32 - 1);
- c_arr[x - 16] = src[x_index] << 3;
- }
- samples2 = LoadUnaligned16(c_arr);
- samples3 = LoadUnaligned16(c_arr + 8);
- static_cast<void>(blend_mask_32);
-#else
- __m128i samples23 = LoadUnaligned16(src + 16);
+ // We can load uninitialized values here. Even though they are then masked
+ // off by blendv, MSAN doesn't model that behavior.
+ __m128i samples23 = LoadUnaligned16Msan(src + 16, invisible_width_32);
if (!inside) {
const __m128i border32 =
_mm_set1_epi8(static_cast<int8_t>(src[visible_width_32 - 1]));
@@ -350,7 +370,6 @@
}
samples2 = _mm_slli_epi16(_mm_cvtepu8_epi16(samples23), 3);
samples3 = _mm_slli_epi16(_mm_unpackhi_epi8(samples23, zero), 3);
-#endif // LIBGAV1_MSAN
StoreUnaligned16(luma_ptr + 16, samples2);
StoreUnaligned16(luma_ptr + 24, samples3);
@@ -418,29 +437,6 @@
}
}
-// Takes in two sums of input row pairs, and completes the computation for two
-// output rows.
-inline __m128i StoreLumaResults4_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreLo8(luma_ptr, result);
- StoreHi8(luma_ptr + kCflLumaBufferStride, result);
- return result;
-}
-
-// Takes two halves of a vertically added pair of rows and completes the
-// computation for one output row.
-inline __m128i StoreLumaResults8_420(const __m128i vertical_sum0,
- const __m128i vertical_sum1,
- int16_t* luma_ptr) {
- __m128i result = _mm_hadd_epi16(vertical_sum0, vertical_sum1);
- result = _mm_slli_epi16(result, 1);
- StoreUnaligned16(luma_ptr, result);
- return result;
-}
-
template <int block_height_log2>
void CflSubsampler420_4xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -511,17 +507,6 @@
}
}
-// This duplicates the last two 16-bit values in |row|.
-inline __m128i LastRowSamples(const __m128i row) {
- return _mm_shuffle_epi32(row, 0xFF);
-}
-
-// This duplicates the last 16-bit value in |row|.
-inline __m128i LastRowResult(const __m128i row) {
- const __m128i dup_row = _mm_shufflehi_epi16(row, 0xFF);
- return _mm_shuffle_epi32(dup_row, 0xFF);
-}
-
template <int block_height_log2, int max_luma_width>
inline void CflSubsampler420Impl_8xH_SSE4_1(
int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
@@ -655,10 +640,11 @@
__m128i final_sum = zero;
const int block_height = 1 << block_height_log2;
const int luma_height = std::min(block_height, max_luma_height >> 1);
+ static_assert(max_luma_width <= 32, "");
int16_t* luma_ptr = luma[0];
__m128i final_row_result;
- // Begin first y section, covering width up to 16.
+ // Begin first y section, covering width up to 32.
int y = 0;
do {
const uint8_t* src_next = src + stride;
@@ -694,29 +680,32 @@
final_row_result =
StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
sum = _mm_add_epi16(sum, final_row_result);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ sum = _mm_add_epi16(sum, wide_fill);
+ sum = _mm_add_epi16(sum, wide_fill);
+ }
final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
src += stride << 1;
luma_ptr += kCflLumaBufferStride;
} while (++y < luma_height);
- // Because max_luma_width is at most 32, any values beyond x=16 will
- // necessarily be duplicated.
- if (block_width_log2 == 5) {
- const __m128i wide_fill = LastRowResult(final_row_result);
- // Multiply duplicated value by number of occurrences, height * 4, since
- // there are 16 in each row and the value appears in the vector 4 times.
- final_sum = _mm_add_epi32(
- final_sum,
- _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), block_height_log2 + 2));
- }
-
// Begin second y section.
if (y < block_height) {
const __m128i final_fill0 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
const __m128i final_fill1 =
LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+
+ if (block_width_log2 == 5) {
+      // There are 16 16-bit fill values per row; the vector covers 4 lanes,
+      // so the shift left by 2 (x4) accounts for all of them after widening
+      // to 32-bit.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+
const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
@@ -726,6 +715,9 @@
do {
StoreUnaligned16(luma_ptr, final_fill0);
StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
luma_ptr += kCflLumaBufferStride;
final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
@@ -747,14 +739,10 @@
const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
final_row_result = _mm_sub_epi16(samples1, averages);
StoreUnaligned16(luma_ptr + 8, final_row_result);
- }
- if (block_width_log2 == 5) {
- int16_t* wide_luma_ptr = luma[0] + 16;
- const __m128i wide_fill = LastRowResult(final_row_result);
- for (int i = 0; i < block_height;
- ++i, wide_luma_ptr += kCflLumaBufferStride) {
- StoreUnaligned16(wide_luma_ptr, wide_fill);
- StoreUnaligned16(wide_luma_ptr + 8, wide_fill);
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
}
}
}
@@ -958,12 +946,887 @@
} // namespace
} // namespace low_bitdepth
-void IntraPredCflInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// CflIntraPredictor_10bpp_SSE4_1
+
+inline __m128i CflPredictUnclipped(const __m128i* input, __m128i alpha_q12,
+ __m128i alpha_sign, __m128i dc_q0) {
+ const __m128i ac_q3 = LoadUnaligned16(input);
+ const __m128i ac_sign = _mm_sign_epi16(alpha_sign, ac_q3);
+ __m128i scaled_luma_q0 = _mm_mulhrs_epi16(_mm_abs_epi16(ac_q3), alpha_q12);
+ scaled_luma_q0 = _mm_sign_epi16(scaled_luma_q0, ac_sign);
+ return _mm_add_epi16(scaled_luma_q0, dc_q0);
+}
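+
+// |alpha_q12| is |alpha| << 9, so _mm_mulhrs_epi16 above computes
+// (|ac_q3| * (|alpha| << 9) + (1 << 14)) >> 15, i.e.
+// RightShiftWithRounding(|ac_q3| * |alpha|, 6), the alpha * luma_q3 >> 6 term
+// of the CfL prediction; the combined sign is then restored and the DC value
+// added.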
+
+inline __m128i ClipEpi16(__m128i x, __m128i min, __m128i max) {
+ return _mm_max_epi16(_mm_min_epi16(x, max), min);
+}
+
+template <int width, int height>
+void CflIntraPredictor_10bpp_SSE4_1(
+ void* const dest, ptrdiff_t stride,
+ const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int alpha) {
+ constexpr int kCflLumaBufferStrideLog2_16i = 5;
+ constexpr int kCflLumaBufferStrideLog2_128i =
+ kCflLumaBufferStrideLog2_16i - 3;
+ constexpr int kRowIncr = 1 << kCflLumaBufferStrideLog2_128i;
+ auto* dst = static_cast<uint16_t*>(dest);
+ const __m128i alpha_sign = _mm_set1_epi16(alpha);
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
+ auto* row = reinterpret_cast<const __m128i*>(luma);
+ const __m128i* row_end = row + (height << kCflLumaBufferStrideLog2_128i);
+ const __m128i dc_val = _mm_set1_epi16(dst[0]);
+ const __m128i min = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16((1 << kBitdepth10) - 1);
+
+ stride >>= 1;
+
+ do {
+ __m128i res = CflPredictUnclipped(row, alpha_q12, alpha_sign, dc_val);
+ res = ClipEpi16(res, min, max);
+ if (width == 4) {
+ StoreLo8(dst, res);
+ } else if (width == 8) {
+ StoreUnaligned16(dst, res);
+ } else if (width == 16) {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ } else {
+ StoreUnaligned16(dst, res);
+ const __m128i res_1 =
+ CflPredictUnclipped(row + 1, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 8, ClipEpi16(res_1, min, max));
+ const __m128i res_2 =
+ CflPredictUnclipped(row + 2, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 16, ClipEpi16(res_2, min, max));
+ const __m128i res_3 =
+ CflPredictUnclipped(row + 3, alpha_q12, alpha_sign, dc_val);
+ StoreUnaligned16(dst + 24, ClipEpi16(res_3, min, max));
+ }
+
+ dst += stride;
+ } while ((row += kRowIncr) < row_end);
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ static_assert(block_height_log2 <= 4, "");
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+  const __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadHi8(LoadLo8(src), src + src_stride);
+ src += src_stride << 1;
+ sum = _mm_add_epi16(sum, samples);
+ y -= 2;
+ } while (y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ samples = _mm_unpackhi_epi64(samples, samples);
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ y += 2;
+ } while (y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // A plain average over this 4-wide block would use a right shift of
+  // block_height_log2 + 2; shifting by 3 less folds in the << 3 precision
+  // scaling applied to the samples below.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2 - 1);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadLo8(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 4, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+
+ if (block_height <= max_luma_height) {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_4xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_height_log2, bool is_inside>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum = zero;
+ __m128i samples;
+ int y = visible_height;
+
+ do {
+ samples = LoadUnaligned16(src);
+ src += src_stride;
+ sum = _mm_add_epi16(sum, samples);
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ do {
+ sum = _mm_add_epi16(sum, samples);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(_mm_unpackhi_epi16(sum, zero), _mm_cvtepu16_epi32(sum));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // A plain average over this 8-wide block would use a right shift of
+  // block_height_log2 + 3; the << 3 precision scaling applied to the samples
+  // below cancels it down to block_height_log2.
+ __m128i averages = RightShiftWithRounding_U32(sum, block_height_log2);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ samples = LoadUnaligned16(src);
+ samples = _mm_slli_epi16(samples, 3);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler444_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_cast<void>(max_luma_width);
+ static_cast<void>(max_luma_height);
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+ const int block_height = 1 << block_height_log2;
+ const int block_width = 8;
+
+ const int horz_inside = block_width <= max_luma_width;
+ const int vert_inside = block_height <= max_luma_height;
+ if (horz_inside && vert_inside) {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, true>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler444_8xH_SSE4_1<block_height_log2, false>(luma, max_luma_height,
+ source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, bool is_inside>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const int visible_height = max_luma_height;
+ const int block_width = 1 << block_width_log2;
+ const __m128i dup16 = _mm_set1_epi32(0x01000100);
+ const __m128i zero = _mm_setzero_si128();
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ __m128i sum = zero;
+ __m128i inner_sum_lo, inner_sum_hi;
+ __m128i samples[4];
+ int y = visible_height;
+
+ do {
+ samples[0] = LoadUnaligned16(src);
+ samples[1] = (max_luma_width >= 16) ? LoadUnaligned16(src + 8)
+ : LastRowResult(samples[0]);
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ samples[2] = (max_luma_width >= 24) ? LoadUnaligned16(src + 16)
+ : LastRowResult(samples[1]);
+ samples[3] = (max_luma_width == 32) ? LoadUnaligned16(src + 24)
+ : LastRowResult(samples[2]);
+
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ src += src_stride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ __m128i inner_sum = _mm_add_epi16(samples[0], samples[1]);
+ if (block_width == 32) {
+ inner_sum = _mm_add_epi16(samples[2], inner_sum);
+ inner_sum = _mm_add_epi16(samples[3], inner_sum);
+ }
+ inner_sum_lo = _mm_cvtepu16_epi32(inner_sum);
+ inner_sum_hi = _mm_unpackhi_epi16(inner_sum, zero);
+ do {
+ sum = _mm_add_epi32(sum, inner_sum_lo);
+ sum = _mm_add_epi32(sum, inner_sum_hi);
+ } while (++y < block_height);
+ }
+
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));
+ sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));
+
+  // A plain average would use a right shift of block_width_log2 +
+  // block_height_log2; subtracting 3 folds in the << 3 precision scaling
+  // applied to the samples below.
+ __m128i averages =
+ RightShiftWithRounding_U32(sum, block_width_log2 + block_height_log2 - 3);
+ averages = _mm_shuffle_epi8(averages, dup16);
+
+ src = static_cast<const uint16_t*>(source);
+ __m128i samples_ext = zero;
+ luma_ptr = luma[0];
+ y = visible_height;
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ if (max_luma_width > x) {
+ samples[idx] = LoadUnaligned16(&src[x]);
+ samples[idx] = _mm_slli_epi16(samples[idx], 3);
+ samples_ext = samples[idx];
+ } else {
+ samples[idx] = LastRowResult(samples_ext);
+ }
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ src += src_stride;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ if (!is_inside) {
+ y = visible_height;
+ // Replicate last line
+ do {
+ int idx = 0;
+ for (int x = 0; x < block_width; x += 8) {
+ StoreUnaligned16(&luma_ptr[x], _mm_sub_epi16(samples[idx++], averages));
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (++y < block_height);
+ }
+}
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler444_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ static_assert(block_width_log2 == 4 || block_width_log2 == 5,
+ "This function will only work for block_width 16 and 32.");
+ static_assert(block_height_log2 <= 5, "");
+ assert(max_luma_width >= 4);
+ assert(max_luma_height >= 4);
+
+ const int block_height = 1 << block_height_log2;
+ const int vert_inside = block_height <= max_luma_height;
+ if (vert_inside) {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, true>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ } else {
+ CflSubsampler444_WxH_SSE4_1<block_width_log2, block_height_log2, false>(
+ luma, max_luma_width, max_luma_height, source, stride);
+ }
+}
+
+template <int block_height_log2>
+void CflSubsampler420_4xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int /*max_luma_width*/, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ int16_t* luma_ptr = luma[0];
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row0 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row1 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row0, samples_row1);
+
+ const __m128i samples_row2 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row3 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum23 = _mm_add_epi16(samples_row2, samples_row3);
+ __m128i sum = StoreLumaResults4_420(luma_sum01, luma_sum23, luma_ptr);
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ const __m128i samples_row4 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row5 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum45 = _mm_add_epi16(samples_row4, samples_row5);
+
+ const __m128i samples_row6 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i samples_row7 = LoadUnaligned16(src);
+ src += src_stride;
+ const __m128i luma_sum67 = _mm_add_epi16(samples_row6, samples_row7);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults4_420(luma_sum45, luma_sum67, luma_ptr));
+ luma_ptr += kCflLumaBufferStride << 1;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ const __m128i final_fill = LoadLo8(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum = _mm_cvtepu16_epi32(final_fill);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreLo8(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_U32(
+ final_sum, block_height_log2 + 2 /*log2 of width 4*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadLo8(luma_ptr);
+ StoreLo8(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const int block_height = 1 << block_height_log2;
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ int16_t* luma_ptr = luma[0];
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int y = luma_height;
+
+ do {
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ src += src_stride;
+ const __m128i samples_row10 = LoadUnaligned16(src);
+ const __m128i samples_row11 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row10);
+ src += src_stride;
+ const __m128i luma_sum00 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum01 = _mm_add_epi16(samples_row01, samples_row11);
+ __m128i sum = StoreLumaResults8_420(luma_sum00, luma_sum01, luma_ptr);
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row20 = LoadUnaligned16(src);
+ const __m128i samples_row21 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row20);
+ src += src_stride;
+ const __m128i samples_row30 = LoadUnaligned16(src);
+ const __m128i samples_row31 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row30);
+ src += src_stride;
+ const __m128i luma_sum10 = _mm_add_epi16(samples_row20, samples_row30);
+ const __m128i luma_sum11 = _mm_add_epi16(samples_row21, samples_row31);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum10, luma_sum11, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row40 = LoadUnaligned16(src);
+ const __m128i samples_row41 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row40);
+ src += src_stride;
+ const __m128i samples_row50 = LoadUnaligned16(src);
+ const __m128i samples_row51 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row50);
+ src += src_stride;
+ const __m128i luma_sum20 = _mm_add_epi16(samples_row40, samples_row50);
+ const __m128i luma_sum21 = _mm_add_epi16(samples_row41, samples_row51);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum20, luma_sum21, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ const __m128i samples_row60 = LoadUnaligned16(src);
+ const __m128i samples_row61 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row60);
+ src += src_stride;
+ const __m128i samples_row70 = LoadUnaligned16(src);
+ const __m128i samples_row71 = (max_luma_width == 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row70);
+ src += src_stride;
+ const __m128i luma_sum30 = _mm_add_epi16(samples_row60, samples_row70);
+ const __m128i luma_sum31 = _mm_add_epi16(samples_row61, samples_row71);
+ sum = _mm_add_epi16(
+ sum, StoreLumaResults8_420(luma_sum30, luma_sum31, luma_ptr));
+ luma_ptr += kCflLumaBufferStride;
+
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+ y -= 4;
+ } while (y != 0);
+
+ // Duplicate the final row downward to fill the rows past |luma_height|.
+ const __m128i final_fill = LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill_to_sum0 = _mm_cvtepi16_epi32(final_fill);
+ const __m128i final_fill_to_sum1 =
+ _mm_cvtepi16_epi32(_mm_srli_si128(final_fill, 8));
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_fill_to_sum0, final_fill_to_sum1);
+ for (y = luma_height; y < block_height; ++y) {
+ StoreUnaligned16(luma_ptr, final_fill);
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ }
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_height_log2 + 3 /*log2 of width 8*/);
+
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples, averages));
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
+
+template <int block_height_log2>
+void CflSubsampler420_8xH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ if (max_luma_width == 8) {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 8>(luma, max_luma_height,
+ source, stride);
+ } else {
+ CflSubsampler420Impl_8xH_SSE4_1<block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ }
+}
+
+template <int block_width_log2, int block_height_log2, int max_luma_width>
+inline void CflSubsampler420Impl_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_height, const void* const source, ptrdiff_t stride) {
+ const auto* src = static_cast<const uint16_t*>(source);
+ const ptrdiff_t src_stride = stride / sizeof(src[0]);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i final_sum = zero;
+ const int block_height = 1 << block_height_log2;
+ const int luma_height = std::min(block_height, max_luma_height >> 1);
+ int16_t* luma_ptr = luma[0];
+ __m128i final_row_result;
+ // Begin first y section, covering width up to 32.
+ int y = luma_height;
+
+ do {
+ const uint16_t* src_next = src + src_stride;
+ const __m128i samples_row00 = LoadUnaligned16(src);
+ const __m128i samples_row01 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src + 8)
+ : LastRowSamples(samples_row00);
+ const __m128i samples_row02 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src + 16)
+ : LastRowSamples(samples_row01);
+ const __m128i samples_row03 = (max_luma_width == 32)
+ ? LoadUnaligned16(src + 24)
+ : LastRowSamples(samples_row02);
+ const __m128i samples_row10 = LoadUnaligned16(src_next);
+ const __m128i samples_row11 = (max_luma_width >= 16)
+ ? LoadUnaligned16(src_next + 8)
+ : LastRowSamples(samples_row10);
+ const __m128i samples_row12 = (max_luma_width >= 24)
+ ? LoadUnaligned16(src_next + 16)
+ : LastRowSamples(samples_row11);
+ const __m128i samples_row13 = (max_luma_width == 32)
+ ? LoadUnaligned16(src_next + 24)
+ : LastRowSamples(samples_row12);
+ const __m128i luma_sum0 = _mm_add_epi16(samples_row00, samples_row10);
+ const __m128i luma_sum1 = _mm_add_epi16(samples_row01, samples_row11);
+ const __m128i luma_sum2 = _mm_add_epi16(samples_row02, samples_row12);
+ const __m128i luma_sum3 = _mm_add_epi16(samples_row03, samples_row13);
+ __m128i sum = StoreLumaResults8_420(luma_sum0, luma_sum1, luma_ptr);
+ final_row_result =
+ StoreLumaResults8_420(luma_sum2, luma_sum3, luma_ptr + 8);
+ sum = _mm_add_epi16(sum, final_row_result);
+ final_sum = _mm_add_epi32(final_sum, _mm_cvtepu16_epi32(sum));
+ final_sum = _mm_add_epi32(final_sum, _mm_unpackhi_epi16(sum, zero));
+
+ // Because max_luma_width is at most 32, any values beyond x=16 will
+ // necessarily be duplicated.
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ // There are 16 16-bit fill values per row, but widening leaves only 4
+ // 32-bit lanes, so each lane stands in for 4 values; the shift left by 2
+ // multiplies by 4 to compensate.
+ final_sum = _mm_add_epi32(
+ final_sum, _mm_slli_epi32(_mm_cvtepi16_epi32(wide_fill), 2));
+ }
+ src += src_stride << 1;
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+
+ // Begin second y section.
+ y = luma_height;
+ if (y < block_height) {
+ const __m128i final_fill0 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride);
+ const __m128i final_fill1 =
+ LoadUnaligned16(luma_ptr - kCflLumaBufferStride + 8);
+ __m128i wide_fill;
+ if (block_width_log2 == 5) {
+ // There are 16 16-bit fill values per row, but widening leaves only 4
+ // 32-bit lanes, so each lane stands in for 4 values; the shift left by 2
+ // multiplies by 4 to compensate.
+ wide_fill =
+ _mm_slli_epi32(_mm_cvtepi16_epi32(LastRowResult(final_fill1)), 2);
+ }
+ const __m128i final_inner_sum = _mm_add_epi16(final_fill0, final_fill1);
+ const __m128i final_inner_sum0 = _mm_cvtepu16_epi32(final_inner_sum);
+ const __m128i final_inner_sum1 = _mm_unpackhi_epi16(final_inner_sum, zero);
+ const __m128i final_fill_to_sum =
+ _mm_add_epi32(final_inner_sum0, final_inner_sum1);
+
+ do {
+ StoreUnaligned16(luma_ptr, final_fill0);
+ StoreUnaligned16(luma_ptr + 8, final_fill1);
+ if (block_width_log2 == 5) {
+ final_sum = _mm_add_epi32(final_sum, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ final_sum = _mm_add_epi32(final_sum, final_fill_to_sum);
+ } while (++y < block_height);
+ } // End second y section.
+
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 8));
+ final_sum = _mm_add_epi32(final_sum, _mm_srli_si128(final_sum, 4));
+
+ __m128i averages = RightShiftWithRounding_S32(
+ final_sum, block_width_log2 + block_height_log2);
+ averages = _mm_shufflelo_epi16(averages, 0);
+ averages = _mm_shuffle_epi32(averages, 0);
+
+ luma_ptr = luma[0];
+ y = block_height;
+ do {
+ const __m128i samples0 = LoadUnaligned16(luma_ptr);
+ StoreUnaligned16(luma_ptr, _mm_sub_epi16(samples0, averages));
+ const __m128i samples1 = LoadUnaligned16(luma_ptr + 8);
+ final_row_result = _mm_sub_epi16(samples1, averages);
+ StoreUnaligned16(luma_ptr + 8, final_row_result);
+
+ if (block_width_log2 == 5) {
+ const __m128i wide_fill = LastRowResult(final_row_result);
+ StoreUnaligned16(luma_ptr + 16, wide_fill);
+ StoreUnaligned16(luma_ptr + 24, wide_fill);
+ }
+ luma_ptr += kCflLumaBufferStride;
+ } while (--y != 0);
+}
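+
+// The tail of each subsampler performs the same reduction: a horizontal
+// 32-bit sum of |final_sum|, a rounded divide by the pixel count, and a
+// broadcast subtract. In scalar terms (a sketch, not part of the build):
+//
+//   const int count_log2 = block_width_log2 + block_height_log2;
+//   const int average = (sum + (1 << (count_log2 - 1))) >> count_log2;
+//   for (/* each stored luma value v */) v -= average;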
+
+template <int block_width_log2, int block_height_log2>
+void CflSubsampler420_WxH_SSE4_1(
+ int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
+ const int max_luma_width, const int max_luma_height,
+ const void* const source, ptrdiff_t stride) {
+ switch (max_luma_width) {
+ case 8:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 8>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 16:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 16>(
+ luma, max_luma_height, source, stride);
+ return;
+ case 24:
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 24>(
+ luma, max_luma_height, source, stride);
+ return;
+ default:
+ assert(max_luma_width == 32);
+ CflSubsampler420Impl_WxH_SSE4_1<block_width_log2, block_height_log2, 32>(
+ luma, max_luma_height, source, stride);
+ return;
+ }
+}
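+
+// |max_luma_width| is a template argument so each load-or-replicate choice
+// above folds to a compile-time constant. Conceptually, LastRowSamples()
+// implements right-edge clamping; a scalar sketch of the equivalent read,
+// assuming |max_luma_width| is a multiple of 8 as in the dispatch above:
+//
+//   const int sx = std::min(x, max_luma_width - 1);
+//   sample = src_row[sx];  // Columns past the visible edge repeat the last.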
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x4] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x8] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize4x16] =
+ CflIntraPredictor_10bpp_SSE4_1<4, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x4] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x8] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x16] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize8x32] =
+ CflIntraPredictor_10bpp_SSE4_1<8, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x4] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x8] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x16] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize16x32] =
+ CflIntraPredictor_10bpp_SSE4_1<16, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x8] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 8>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x16] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 16>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflIntraPredictor)
+ dsp->cfl_intra_predictors[kTransformSize32x32] =
+ CflIntraPredictor_10bpp_SSE4_1<32, 32>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType420] =
+ CflSubsampler420_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType420] =
+ CflSubsampler420_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler420)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType420] =
+ CflSubsampler420_WxH_SSE4_1<5, 5>;
+#endif
+
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x4][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x8][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize4x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize4x16][kSubsamplingType444] =
+ CflSubsampler444_4xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x4][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x8][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x16][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize8x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize8x32][kSubsamplingType444] =
+ CflSubsampler444_8xH_SSE4_1<5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x4_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x4][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 2>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize16x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize16x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<4, 5>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x8_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x8][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 3>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x16_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x16][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 4>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(TransformSize32x32_CflSubsampler444)
+ dsp->cfl_subsamplers[kTransformSize32x32][kSubsamplingType444] =
+ CflSubsampler444_WxH_SSE4_1<5, 5>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredCflInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -973,4 +1836,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_cfl_sse4.h b/libgav1/src/dsp/x86/intrapred_cfl_sse4.h
new file mode 100644
index 0000000..5d1a425
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_cfl_sse4.h
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::cfl_intra_predictors and Dsp::cfl_subsamplers, see the
+// defines below for specifics. These functions are not thread-safe.
+void IntraPredCflInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+//------------------------------------------------------------------------------
+// 10bpp
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor
+#define LIBGAV1_Dsp10bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_CFL_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.cc b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
new file mode 100644
index 0000000..e642aee
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.cc
@@ -0,0 +1,1478 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_directional.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/memory.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the weighted average
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because |top_x| starts at |xstep| (== 64), row
+// y copies from top[y + 1]; hence |top| is offset by 1.
+inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ return;
+ }
+ int y = 0;
+ do {
+ memcpy(dst, top + offset, width);
+ dst += stride;
+ memcpy(dst, top + offset + 1, width);
+ dst += stride;
+ memcpy(dst, top + offset + 2, width);
+ dst += stride;
+ memcpy(dst, top + offset + 3, width);
+ dst += stride;
+ memcpy(dst, top + offset + 4, width);
+ dst += stride;
+ memcpy(dst, top + offset + 5, width);
+ dst += stride;
+ memcpy(dst, top + offset + 6, width);
+ dst += stride;
+ memcpy(dst, top + offset + 7, width);
+ dst += stride;
+
+ offset += 8;
+ y += 8;
+ } while (y < height);
+}
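+
+// With |xstep| == 64 the whole prediction therefore degenerates to a
+// diagonal copy; a scalar sketch of what the memcpy loops above implement:
+//
+//   for (int y = 0; y < height; ++y)
+//     for (int x = 0; x < width; ++x) dst[y * stride + x] = top[x + y + 1];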
+
+inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
+ const uint8_t* const top, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+ const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
+ : _mm_set_epi64x(0, 0x0403030202010100);
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ // All rows from |min_corner_only_y| down will simply use memset.
+ // |max_base_x| is always greater than |height|, so clamping |xstep_units|
+ // to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows before |min_corner_only_y| sample from |top|, blending in the corner
+ // value where indices pass |max_base_x|.
+ int y = 0;
+ int top_x = xstep;
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadLo8(top + top_base_x);
+ const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
+ prod = RightShiftWithRounding_U16(prod, rounding_bits);
+ // Replace pixels from invalid range with top-right corner.
+ prod = _mm_blendv_epi8(prod, final_top_val, past_max);
+ Store4(dst, _mm_packus_epi16(prod, prod));
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
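+
+// The vector math above is the spec's two-tap filter. Per output pixel in
+// the non-upsampled case (a scalar sketch; |shift| is constant per row):
+//
+//   const int base = top_x >> scale_bits;
+//   const int shift = (top_x & 0x3F) >> 1;
+//   dst[x] = (top[base + x] * (32 - shift) + top[base + x + 1] * shift +
+//             16) >> 5;  // RightShiftWithRounding_U16(prod, 5).
+//
+// _mm_maddubs_epi16 evaluates the paired products for 8 pixels at once from
+// the interleaved (32 - shift, shift) bytes.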
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // All rows from |min_corner_only_y| down will simply use memset.
+ // |max_base_x| is always greater than |height|, so clamping |xstep_units|
+ // to at least 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ // Corner-only section of the row.
+ memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled);
+ return;
+ }
+ const __m128i sampler =
+ upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const int scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+ return;
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ int top_x = xstep;
+ int y = 0;
+ do {
+ int top_base_x = top_x >> scale_bits;
+
+ if (top_base_x >= max_base_x) {
+ for (int i = y; i < height; ++i) {
+ memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+ return;
+ }
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ for (; x < width - 8;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
+ // reading out of bounds. If all indices are past max and we don't need to
+ // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
+ // reset for the next |y|.
+ top_base_x &= ~_mm_cvtsi128_si32(past_max);
+ const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
+ __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ }
+ const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(top_row + top_base_x);
+ } else {
+ const __m128i top_vals = LoadLo8(top_row + top_base_x);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ vals = _mm_blendv_epi8(vals, final_top_val, past_max);
+ StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
+ dest += stride;
+ top_x += xstep;
+ } while (++y < height);
+}
+
+void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const int width, const int height,
+ const int xstep,
+ const bool upsampled_top) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ auto* dst = static_cast<uint8_t*>(dest);
+ DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
+ upsampled_top);
+}
+
+template <bool upsampled>
+inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[4];
+ for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadLo8(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadLo8(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ vals = RightShiftWithRounding_U16(vals, rounding_bits);
+ result_block[x] = _mm_packus_epi16(vals, vals);
+ }
+ const __m128i result = Transpose4x4_U8(result_block);
+ // This is result_row0.
+ Store4(dest, result);
+ dest += stride;
+ const int result_row1 = _mm_extract_epi32(result, 1);
+ memcpy(dest, &result_row1, sizeof(result_row1));
+ dest += stride;
+ const int result_row2 = _mm_extract_epi32(result, 2);
+ memcpy(dest, &result_row2, sizeof(result_row2));
+ dest += stride;
+ const int result_row3 = _mm_extract_epi32(result, 3);
+ memcpy(dest, &result_row3, sizeof(result_row3));
+}
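+
+// Zone 3 mirrors zone 1 about the diagonal: each output *column* is a
+// two-tap filter down |left_column|, so the kernels compute filtered columns
+// into |result_block| and transpose them into rows. One output in scalar
+// form (non-upsampled sketch, with left_y = base_left_y + x * ystep):
+//
+//   const int base = left_y >> 6;
+//   const int shift = (left_y & 0x3F) >> 1;
+//   dst[y * stride + x] = (left[base + y] * (32 - shift) +
+//                          left[base + y + 1] * shift + 16) >> 5;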
+
+template <bool upsampled, int height>
+inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const left_column,
+ const int base_left_y, const int ystep) {
+ // For use in the non-upsampled case.
+ const __m128i sampler =
+ _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shift = _mm_set1_epi8(32);
+ // Downscaling for a weighted average whose weights sum to 32 (max_shift).
+ const int rounding_bits = 5;
+
+ __m128i result_block[8];
+ for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
+ const int left_base_y = left_y >> scale_bits;
+ const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi8(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
+ __m128i vals;
+ if (upsampled) {
+ vals = LoadUnaligned16(left_column + left_base_y);
+ } else {
+ const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
+ vals = _mm_shuffle_epi8(top_vals, sampler);
+ }
+ vals = _mm_maddubs_epi16(vals, shifts);
+ result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
+ }
+ Transpose8x8_U16(result_block, result_block);
+ for (int y = 0; y < height; ++y) {
+ StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (9) angle > 180
+void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
+ const void* const left_column,
+ const int width, const int height,
+ const int ystep,
+ const bool upsampled) {
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (width == 4 || height == 4) {
+ const ptrdiff_t stride4 = stride << 2;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<true>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
+ ystep);
+ dst_x += stride4;
+ y += 4;
+ } while (y < height);
+ left_y += ystep << 2;
+ x += 4;
+ } while (x < width);
+ }
+ return;
+ }
+
+ const ptrdiff_t stride8 = stride << 3;
+ if (upsampled) {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<true, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ } else {
+ int left_y = ystep;
+ int x = 0;
+ do {
+ uint8_t* dst_x = dst + x;
+ int y = 0;
+ do {
+ DirectionalZone3_8xH<false, 8>(
+ dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
+ dst_x += stride8;
+ y += 8;
+ } while (y < height);
+ left_y += ystep << 3;
+ x += 8;
+ } while (x < width);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Directional Zone 2 Functions
+// 7.11.2.4 (8)
+
+// DirectionalBlend* selectively overwrites the values written by
+// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
+// row.
+template <int y_selector>
+inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds) {
+ const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
+
+inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
+ const __m128i& dest_index_vect,
+ const __m128i& vals,
+ const __m128i& zone_bounds,
+ const __m128i& bounds_selector) {
+ const __m128i max_dest_x_vect =
+ _mm_shuffle_epi8(zone_bounds, bounds_selector);
+ const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
+ const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
+ const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
+ StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
+}
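+
+// In scalar terms, the blend keeps the left-column prediction to the left of
+// the per-row zone boundary and the top-row prediction elsewhere (a sketch):
+//
+//   dst[x] = (x < zone_bound[y]) ? from_left_col[x] : from_top_row[x];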
+
+constexpr int kDirectionalWeightBits = 5;
+// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
+// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
+// shift) and shift. Shift is guaranteed to be between 0 and 32.
+inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
+ const __m128i& shifts,
+ const __m128i& sampler) {
+ const __m128i src_vals = LoadUnaligned16(source);
+ __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
+ vals = _mm_maddubs_epi16(vals, shifts);
+ return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
+}
+
+// Because the source values "move backwards" as the row index increases, the
+// indices derived from ystep are generally negative. This is accommodated by
+// making sure the relative indices are within [-15, 0] when the function is
+// called, and sliding them into the inclusive range [0, 15], relative to a
+// lower base address.
+constexpr int kPositiveIndexOffset = 15;
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
+ const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
+ // |left_column| and the sampler are both offset by 15 so the shuffle
+ // indices are always nonnegative.
+ const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
+ for (int y = 0; y < 4; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
+ // can work as shuffle indices. Some values may be out of bounds, but their
+ // pred results will be masked over by top prediction.
+ sampler = _mm_add_epi8(sampler, positive_offset);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column + (y << upsample_shift), shifts, sampler);
+ Store4(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// The height at which a load of 16 bytes will not contain enough source pixels
+// from |left_column| to supply an accurate row when computing 8 pixels at a
+// time. The values are found by inspection. By coincidence, all angles that
+// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
+// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
+constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
+ 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
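+// For example, ystep values in [64, 128) map to index 1 and the value 1024,
+// so the shuffle path is usable at any supported block height, while ystep
+// values in [448, 512) map to index 7 and the value 0, which disables the
+// shuffle path entirely.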
+
+template <bool upsampled>
+inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
+ uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
+ __m128i left_y) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int scale_bits = 6 - upsample_shift;
+ const __m128i max_shifts = _mm_set1_epi8(32);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ const __m128i index_increment = _mm_set1_epi8(1);
+ const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
+ for (int y = 0; y < 8; dst += stride, ++y) {
+ __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
+ offset_y = _mm_packs_epi16(offset_y, offset_y);
+ const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
+
+ // Offset the relative index because ystep is negative in Zone 2 and shuffle
+ // indices must be nonnegative.
+ __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
+ sampler = _mm_add_epi8(sampler, denegation);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+
+ // The specification adds (y << 6) to left_y, which is subject to
+ // upsampling, but this puts sampler indices out of the 0-15 range. It is
+ // equivalent to offsetting the source address by (y << upsample_shift)
+ // instead.
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
+ sampler);
+ StoreLo8(dst, _mm_packus_epi16(vals, vals));
+ }
+}
+
+// |zone_bounds| is an epi16 vector holding, for each row, the relative x index
+// at which base >= -(1 << upsampled_top). When there are only 4 values, they
+// can be duplicated with a non-register shuffle mask.
+// |shifts| is one pair of weights that applies throughout a given row.
+template <bool upsampled_top>
+inline void DirectionalZone1Blend_4x4(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+ top_x -= xstep;
+
+ int top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
+ DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
+ DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
+ DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
+ top_x -= xstep;
+ dest += stride;
+
+ top_base_x = (top_x >> scale_bits_x);
+ const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
+ DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
+}
+
+template <bool upsampled_top, int height>
+inline void DirectionalZone1Blend_8xH(
+ uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
+ __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
+ const __m128i& dest_index_x, int top_x, const int xstep) {
+ const int upsample_shift = static_cast<int>(upsampled_top);
+ const int scale_bits_x = 6 - upsample_shift;
+
+ __m128i y_selector = _mm_set1_epi32(0x01000100);
+ const __m128i index_increment = _mm_set1_epi32(0x02020202);
+ for (int y = 0; y < height; ++y,
+ y_selector = _mm_add_epi8(y_selector, index_increment),
+ dest += stride) {
+ top_x -= xstep;
+ const int top_base_x = top_x >> scale_bits_x;
+ const __m128i vals = DirectionalZone2FromSource_SSE4_1(
+ top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
+ DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
+ }
+}
+
+// 7.11.2.4 (8) 90 < angle < 180
+// The strategy for this function is to know how many blocks can be processed
+// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
+// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
+// approach is used for pred values from |left_column| in sections that permit
+// it.
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride8 = stride << 3;
+ const __m128i dest_index_x =
+ _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute. This assumes minimum |xstep| is 3.
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
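+ // For example, with height = 8, xstep = 64, and width = 16, min_top_only_x =
+ // min((8 * 64) >> 6, 16) = 8, so columns 8 and beyond are predicted from
+ // |top_row| alone by the trailing loop.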
+
+ // For steep angles, the source pixels from |left_column| may not fit in a
+ // 16-byte load for shuffling.
+ // TODO(petersonab): Find a more precise formula for this, as a function of x.
+ const int max_shuffle_height =
+ std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
+
+ const int xstep8 = xstep << 3;
+ const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
+ // Accumulate xstep across 8 rows.
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+ const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ // Offsets the original zone bound value to simplify the comparison
+ // x < ((y + 1) * xstep / 64) - 1.
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep8 = ystep << 3;
+ const int left_base_increment8 = ystep8 >> 6;
+ const int ystep_remainder8 = ystep8 & 0x3F;
+ const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
+
+ // Treating the 1/64 scaling as a fixed binary point, the first value of the
+ // left_y vector omits the whole-pixel portion, which is already covered by
+ // the left_column offset. Following values need the full ystep as a relative
+ // offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
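+ // For example, with ystep = 100: ystep_remainder = 36, so lane 0 of |left_y|
+ // is -36 and lane 1 is -136; the whole-pixel part (100 >> 6 = 1) is carried
+ // through |left_offset| into the |left_column| address instead.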
+
+ const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
+ int x = 0;
+
+ // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
+ // The first stage, before the first y-loop, covers blocks that are only
+ // computed from the top row. The second stage, comprising two y-loops, covers
+ // blocks that have a mixture of values computed from top or left. The final
+ // stage covers blocks that are only computed from the left.
+ for (int left_offset = -left_base_increment; x < min_top_only_x;
+ x += 8,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
+ // Watch |left_y| because its accumulated magnitude can still grow large.
+ left_y = _mm_add_epi16(left_y, increment_left8),
+ left_offset -= left_base_increment8) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 8.
+ const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ DirectionalZone1_4xH(dst_x + 4, stride,
+ top_row + ((x + 4) << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down, for this set of columns, need only
+ // |left_column| to compute.
+ const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
+ // At high angles such that min_left_only_y < 8, ystep is low and xstep is
+ // high. This means that max_shuffle_height is unbounded and xstep_bounds
+ // will overflow in 16 bits. This is prevented by stopping the first
+ // blending loop at min_left_only_y for such cases, which means we skip over
+ // the second blending loop as well.
+ const int left_shuffle_stop_y =
+ std::min(max_shuffle_height, min_left_only_y);
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ for (; y < left_shuffle_stop_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Pick up from the last y-value, using the 10% slower but secure method for
+ // left prediction.
+ const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
+ for (; y < min_left_only_y;
+ y += 8, dst_x += stride8,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
+ top_x -= xstep8) {
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ DirectionalZone1Blend_8xH<upsampled_top, 8>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left_only rows.
+ for (; y < height; y += 8, dst_x += stride8) {
+ DirectionalZone3_8xH<upsampled_left, 8>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
+ -ystep);
+ }
+ }
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+template <bool upsampled_left, bool upsampled_top>
+inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
+ const uint8_t* const top_row,
+ const uint8_t* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep) {
+ auto* dst = static_cast<uint8_t*>(dest);
+ const int upsample_left_shift = static_cast<int>(upsampled_left);
+ const int upsample_top_shift = static_cast<int>(upsampled_top);
+ const __m128i max_shift = _mm_set1_epi8(32);
+ const ptrdiff_t stride4 = stride << 2;
+ const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
+ const __m128i sampler_top =
+ upsampled_top
+ ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
+ : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
+ // All columns from |min_top_only_x| to the right will only need |top_row| to
+ // compute.
+ assert(xstep >= 3);
+ const int min_top_only_x = std::min((height * xstep) >> 6, width);
+
+ const int xstep4 = xstep << 2;
+ const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
+ const __m128i xstep_dup = _mm_set1_epi16(-xstep);
+ const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
+ __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
+ const __m128i scaled_one = _mm_set1_epi16(-64);
+ // Offsets the original zone bound value to simplify the comparison
+ // x < ((y + 1) * xstep / 64) - 1.
+ __m128i xstep_bounds_base =
+ (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
+ : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
+
+ const int left_base_increment = ystep >> 6;
+ const int ystep_remainder = ystep & 0x3F;
+ const int ystep4 = ystep << 2;
+ const int left_base_increment4 = ystep4 >> 6;
+ // This is guaranteed to be less than 64, but accumulation may bring it past
+ // 64 for higher x values.
+ const int ystep_remainder4 = ystep4 & 0x3F;
+ const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
+ const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
+
+ // Treating the 1/64 scaling as a fixed binary point, the first value of the
+ // left_y vector omits the whole-pixel portion, which goes into the
+ // left_column offset. Following values need the full ystep as a relative
+ // offset.
+ const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
+ const __m128i ystep_dup = _mm_set1_epi16(-ystep);
+ __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
+ left_y = _mm_add_epi16(ystep_init, left_y);
+ const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
+
+ int x = 0;
+ // Loop over x for columns with a mixture of sources.
+ for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
+ xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
+ left_y = _mm_add_epi16(left_y, increment_left4),
+ left_offset -= left_base_increment4) {
+ uint8_t* dst_x = dst + x;
+
+ // Round down to the nearest multiple of 4.
+ const int max_top_only_y = std::min((x << 6) / xstep, height) & ~3;
+ DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
+ max_top_only_y, -xstep, upsampled_top);
+ int y = max_top_only_y;
+ dst_x += stride * y;
+ const int xstep_y = xstep * y;
+ const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
+ // All rows from |min_left_only_y| down, for this set of columns, need only
+ // |left_column| to compute; the y-loop bounds effectively round this up to
+ // the next multiple of 4.
+ const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
+
+ __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
+ __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
+ int top_x = -xstep_y;
+
+ // Loop over y for mixed rows.
+ for (; y < min_left_only_y;
+ y += 4, dst_x += stride4,
+ xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
+ xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
+ top_x -= xstep4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) * (1 << upsample_left_shift)),
+ left_y);
+
+ __m128i shifts = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
+ shift_mask),
+ 1);
+ shifts = _mm_packus_epi16(shifts, shifts);
+ const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
+ shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
+ const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
+ DirectionalZone1Blend_4x4<upsampled_top>(
+ dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
+ xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
+ }
+ // Loop over y for left-only rows, if any.
+ for (; y < height; y += 4, dst_x += stride4) {
+ DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
+ dst_x, stride,
+ left_column + ((left_offset + y) << upsample_left_shift), left_y);
+ }
+ }
+ // Loop over top-only columns, if any.
+ for (; x < width; x += 4) {
+ DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
+ height, -xstep, upsampled_top);
+ }
+}
+
+void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ const int width, const int height,
+ const int xstep, const int ystep,
+ const bool upsampled_top,
+ const bool upsampled_left) {
+ // Increasing the negative buffer for this function allows more rows to be
+ // processed at a time without branching in an inner loop to check the base.
+ uint8_t top_buffer[288];
+ uint8_t left_buffer[288];
+ memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
+ memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
+ const uint8_t* top_ptr = top_buffer + 144;
+ const uint8_t* left_ptr = left_buffer + 144;
+ if (width == 4 || height == 4) {
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+ return;
+ }
+ if (upsampled_left) {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ } else {
+ if (upsampled_top) {
+ DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ } else {
+ DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
+ width, height, xstep, ystep);
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
+ dsp->directional_intra_predictor_zone2 =
+ DirectionalIntraPredictorZone2_SSE4_1;
+#endif
+#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
+ dsp->directional_intra_predictor_zone3 =
+ DirectionalIntraPredictorZone3_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+//------------------------------------------------------------------------------
+// 7.11.2.4. Directional intra prediction process
+
+// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
+// upsampling is ruled out. In addition, the bits masked by 0x3F for
+// |shift_val| are 0 for all multiples of 64, so the formula
+// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
+// val = top[top_base_x] << 5. Because |top_x| begins at |xstep|, row y reads
+// from top_base_x = y + 1, so |top| is offset by 1.
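+// For example, row 0 uses top_x = 64, hence top_base_x = 1; row 1 uses
+// top_base_x = 2, matching the per-row advance of the copies below.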
+inline void DirectionalZone1_Step64(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int width,
+ const int height) {
+ ptrdiff_t offset = 1;
+ if (height == 4) {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ return;
+ }
+ int y = height;
+ do {
+ memcpy(dst, top + offset, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 1, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 2, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 3, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 4, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 5, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 6, width * sizeof(dst[0]));
+ dst += stride;
+ memcpy(dst, top + offset + 7, width * sizeof(dst[0]));
+ dst += stride;
+
+ offset += 8;
+ y -= 8;
+ } while (y != 0);
+}
+
+// Produce a weighted average whose weights sum to 32.
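+// That is, (v0 * (32 - s) + v1 * s + 16) >> 5; for 10-bit input the
+// intermediate sum stays within 16 unsigned bits (at most 1023 * 32 + 16).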
+inline __m128i CombineTopVals4(const __m128i& top_vals, const __m128i& sampler,
+ const __m128i& shifts,
+ const __m128i& top_indices,
+ const __m128i& final_top_val,
+ const __m128i& border_index) {
+ const __m128i sampled_values = _mm_shuffle_epi8(top_vals, sampler);
+ __m128i prod = _mm_mullo_epi16(sampled_values, shifts);
+ prod = _mm_hadd_epi16(prod, prod);
+ const __m128i result = RightShiftWithRounding_U16(prod, 5 /*log2(32)*/);
+
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+}
+
+// When width is 4, only one load operation is needed per iteration. We also
+// skip the extra loop precomputations used by wider blocks, whose overhead
+// would outweigh their benefit here.
+inline void DirectionalZone1_4xH(uint16_t* dst, ptrdiff_t stride,
+ const uint16_t* const top, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" because
+ // only cmpgt is available.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
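+ // With these offsets, lane i of top_index_vect holds top_base_x + i + 1, so
+ // the cmpgt against |max_base_x| behaves like top_base_x + i >= max_base_x.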
+
+ // All rows from |min_corner_only_y| down will simply use Memset with the
+ // corner pixel. |max_base_x| is always greater than |height|, so clipping
+ // the denominator to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
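+ // For example, height = 8 without upsampling gives max_base_x = 11; with
+ // xstep = 256, xstep_units = 4 and min_corner_only_y = min(11 / 4, 8) = 2,
+ // so rows 2 through 7 are filled with the corner pixel.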
+
+ int y = 0;
+ int top_x = xstep;
+ const __m128i max_shift = _mm_set1_epi16(32);
+
+ for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
+ const int top_base_x = top_x >> index_scale_bits;
+
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
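+ // For example, top_x = 100 with no upsampling gives shift_val =
+ // (100 & 0x3F) >> 1 = 18 and the weight pair (32 - 18, 18) = (14, 18),
+ // which sums to 32.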
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ // Load 8 values because we will select the sampled values based on
+ // |upsampled|.
+ const __m128i values = LoadUnaligned16(top + top_base_x);
+ const __m128i pred =
+ CombineTopVals4(values, sampler, shifts, top_index_vect, final_top_val,
+ max_base_x_vect);
+ StoreLo8(dst, pred);
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dst, top[max_base_x], /* width */ 4);
+ dst += stride;
+ }
+}
+
+// General purpose combine function.
+// |check_border| means the final source value has to be duplicated into the
+// result. This simplifies the loop structures that use precomputed boundaries
+// to identify sections where it is safe to compute without checking for the
+// right border.
+template <bool check_border>
+inline __m128i CombineTopVals(
+ const __m128i& top_vals_0, const __m128i& top_vals_1,
+ const __m128i& sampler, const __m128i& shifts,
+ const __m128i& top_indices = _mm_setzero_si128(),
+ const __m128i& final_top_val = _mm_setzero_si128(),
+ const __m128i& border_index = _mm_setzero_si128()) {
+ constexpr int scale_int_bits = 5;
+ const __m128i sampled_values_0 = _mm_shuffle_epi8(top_vals_0, sampler);
+ const __m128i sampled_values_1 = _mm_shuffle_epi8(top_vals_1, sampler);
+ const __m128i prod_0 = _mm_mullo_epi16(sampled_values_0, shifts);
+ const __m128i prod_1 = _mm_mullo_epi16(sampled_values_1, shifts);
+ const __m128i combined = _mm_hadd_epi16(prod_0, prod_1);
+ const __m128i result = RightShiftWithRounding_U16(combined, scale_int_bits);
+ if (check_border) {
+ const __m128i past_max = _mm_cmpgt_epi16(top_indices, border_index);
+ // Replace pixels from invalid range with top-right corner.
+ return _mm_blendv_epi8(result, final_top_val, past_max);
+ }
+ return result;
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalZone1_Large(uint16_t* dest, ptrdiff_t stride,
+ const uint16_t* const top_row,
+ const int width, const int height,
+ const int xstep, const bool upsampled,
+ const __m128i& sampler) {
+ const int upsample_shift = static_cast<int>(upsampled);
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // All rows from |min_corner_only_y| down will simply use Memset with the
+ // corner pixel. |max_base_x| is always greater than |height|, so clipping
+ // the denominator to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ // Rows up to this y-value can be computed without checking for bounds.
+ const int max_no_corner_y = std::min(
+ LeftShift((max_base_x - (base_step * width)), index_scale_bits) / xstep,
+ height);
+ // No need to check for exceeding |max_base_x| in the first loop.
+ int y = 0;
+ int top_x = xstep;
+ for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ }
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to |top_base_x|, it is used to mask values
+ // that pass the end of the |top| buffer. Starting from 1 to simulate "cmpge"
+ // which is not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ int x = 0;
+ const int min_corner_only_x =
+ std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
+ for (; x < min_corner_only_x;
+ x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ // Corner-only section of the row.
+ Memset(dest + x, top_row[max_base_x], width - x);
+ }
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+// 7.11.2.4 (7) angle < 90
+inline void DirectionalIntraPredictorZone1_SSE4_1(
+ void* dest_ptr, ptrdiff_t stride, const void* const top_ptr,
+ const int width, const int height, const int xstep, const bool upsampled) {
+ const auto* const top_row = static_cast<const uint16_t*>(top_ptr);
+ auto* dest = static_cast<uint16_t*>(dest_ptr);
+ stride /= sizeof(uint16_t);
+ const int upsample_shift = static_cast<int>(upsampled);
+ if (xstep == 64) {
+ DirectionalZone1_Step64(dest, stride, top_row, width, height);
+ return;
+ }
+ // Each base pixel paired with its following pixel, for hadd purposes.
+ const __m128i adjacency_shuffler = _mm_set_epi16(
+ 0x0908, 0x0706, 0x0706, 0x0504, 0x0504, 0x0302, 0x0302, 0x0100);
+ // This is equivalent to not shuffling at all.
+ const __m128i identity_shuffler = _mm_set_epi16(
+ 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100);
+ // This represents a trade-off between code size and speed. When |upsampled|
+ // is true, no shuffle is necessary, but skipping it would require a second
+ // copy of the main function body to avoid in-loop branching.
+ const __m128i sampler = upsampled ? identity_shuffler : adjacency_shuffler;
+ if (width == 4) {
+ DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled,
+ sampler);
+ return;
+ }
+ if (width >= 32) {
+ DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
+ upsampled, sampler);
+ return;
+ }
+ const int index_scale_bits = 6 - upsample_shift;
+ const int max_base_x = ((width + height) - 1) << upsample_shift;
+
+ const __m128i max_shift = _mm_set1_epi16(32);
+ const int base_step = 1 << upsample_shift;
+ const int base_step8 = base_step << 3;
+
+ // No need to check for exceeding |max_base_x| in the loops.
+ if (((xstep * height) >> index_scale_bits) + base_step * width < max_base_x) {
+ int top_x = xstep;
+ int y = height;
+ do {
+ int top_base_x = top_x >> index_scale_bits;
+ // Permit negative values of |top_x|.
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ int x = 0;
+ do {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<false>(top_vals_0, top_vals_1, sampler, shifts);
+ StoreUnaligned16(dest + x, pred);
+ top_base_x += base_step8;
+ x += 8;
+ } while (x < width);
+ dest += stride;
+ top_x += xstep;
+ } while (--y != 0);
+ return;
+ }
+
+ // General case. Blocks with width less than 32 do not benefit from x-wise
+ // loop splitting, but do benefit from using Memset on appropriate rows.
+
+ // Each 16-bit value here corresponds to a position that may exceed
+ // |max_base_x|. When added to the top_base_x, it is used to mask values
+ // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
+ // not supported for packed integers.
+ const __m128i offsets =
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+
+ const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
+ const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
+ const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
+
+ // All rows from |min_corner_only_y| down will simply use Memset with the
+ // corner pixel. |max_base_x| is always greater than |height|, so clipping
+ // the denominator to 1 is enough to make the logic work.
+ const int xstep_units = std::max(xstep >> index_scale_bits, 1);
+ const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
+
+ int top_x = xstep;
+ int y = 0;
+ for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
+ int top_base_x = top_x >> index_scale_bits;
+
+ const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
+ const __m128i shift = _mm_set1_epi16(shift_val);
+ const __m128i opposite_shift = _mm_sub_epi16(max_shift, shift);
+ const __m128i shifts = _mm_unpacklo_epi16(opposite_shift, shift);
+ __m128i top_index_vect = _mm_set1_epi16(top_base_x);
+ top_index_vect = _mm_add_epi16(top_index_vect, offsets);
+
+ for (int x = 0; x < width; x += 8, top_base_x += base_step8,
+ top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
+ const __m128i top_vals_0 = LoadUnaligned16(top_row + top_base_x);
+ const __m128i top_vals_1 =
+ LoadUnaligned16(top_row + top_base_x + (4 << upsample_shift));
+ const __m128i pred =
+ CombineTopVals<true>(top_vals_0, top_vals_1, sampler, shifts,
+ top_index_vect, final_top_val, max_base_x_vect);
+ StoreUnaligned16(dest + x, pred);
+ }
+ }
+
+ // Fill in corner-only rows.
+ for (; y < height; ++y) {
+ Memset(dest, top_row[max_base_x], width);
+ dest += stride;
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(DirectionalIntraPredictorZone1)
+ dsp->directional_intra_predictor_zone1 =
+ DirectionalIntraPredictorZone1_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void IntraPredDirectionalInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredDirectionalInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_directional_sse4.h b/libgav1/src/dsp/x86/intrapred_directional_sse4.h
new file mode 100644
index 0000000..b352450
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_directional_sse4.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::directional_intra_predictor_zone*, see the defines below for
+// specifics. These functions are not thread-safe.
+void IntraPredDirectionalInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
+#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1
+#define LIBGAV1_Dsp10bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_DIRECTIONAL_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_filter_sse4.cc b/libgav1/src/dsp/x86/intrapred_filter_sse4.cc
new file mode 100644
index 0000000..022af8d
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_filter_sse4.cc
@@ -0,0 +1,432 @@
+// Copyright 2021 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/intrapred_filter.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1
+
+#include <xmmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+//------------------------------------------------------------------------------
+// FilterIntraPredictor_SSE4_1
+// Section 7.11.2.3. Recursive intra prediction process
+// This filter applies recursively to 4x2 sub-blocks within the transform block,
+// meaning that the predicted pixels in each sub-block are used as inputs to
+// sub-blocks below and to the right, if present.
+//
+// Each output value in the sub-block is predicted by a different filter applied
+// to the same array of top-left, top, and left values. If fn refers to the
+// output of the nth filter, given this block:
+// TL T0 T1 T2 T3
+// L0 f0 f1 f2 f3
+// L1 f4 f5 f6 f7
+// The filter input order is p0, p1, p2, p3, p4, p5, p6:
+// p0 p1 p2 p3 p4
+// p5 f0 f1 f2 f3
+// p6 f4 f5 f6 f7
+// Filters usually apply to 8 values for convenience, so in this case we fix
+// the 8th filter tap to 0 and disregard the value of the 8th input.
+
+// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
+// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
+constexpr int kDuplicateFirstHalf = 0x44;
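+// In binary, 0x44 is 01'00'01'00, i.e. the dword selectors (0, 1, 0, 1) read
+// from low to high.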
+
+// Apply all filter taps to the given 7 packed 8-bit values; the 8th tap is
+// fixed at zero so the unused 8th input does not disturb the sum.
+// |pixels| contains p0-p7 in order as shown above.
+// |taps_0_1| contains the filter kernels used to predict f0 and f1, and so on.
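+// _mm_maddubs_epi16 multiplies each unsigned byte of |pixels| by the
+// corresponding signed tap byte and adds adjacent products, so each call below
+// yields 8 signed 16-bit partial sums; two rounds of _mm_hadd_epi16 then
+// reduce these to one sum per output pixel.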
+inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
+ const __m128i& pixels, const __m128i& taps_0_1,
+ const __m128i& taps_2_3, const __m128i& taps_4_5,
+ const __m128i& taps_6_7) {
+ const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
+ const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
+ // |output_half| contains 8 partial sums: two for each of f0-f3 here, and two
+ // for each of f4-f7 when the variable is reused below.
+ __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
+ __m128i output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row0 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* unused half */ output);
+ Store4(dst, output_row0);
+ const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
+ const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
+ output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
+ output = _mm_hadd_epi16(output_half, output_half);
+ const __m128i output_row1 =
+ _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
+ /* arbitrary pack arg */ output);
+ Store4(dst + stride, output_row1);
+}
+
+// 4xH transform sizes are given special treatment because LoadLo8 goes out
+// of bounds and every block involves the left column. The top-left pixel, p0,
+// is stored in the top buffer for the first 4x2, but comes from the left buffer
+// for successive blocks. This implementation takes advantage of the fact
+// that the p5 and p6 for each sub-block come solely from the |left_ptr| buffer,
+// using shifts to arrange things to fit reusable shuffle vectors.
+inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
+ const uint8_t* const top_ptr,
+ const uint8_t* const left_ptr, FilterIntraPredictor pred,
+ const int height) {
+ // Two filter kernels per vector.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+ __m128i top = Load4(top_ptr - 1);
+ __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
+ __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
+ left = _mm_slli_si128(left, 5);
+
+ // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
+ // left[2], left[3], left[4], left[5], left[6], left[7]
+ // Let rn represent a pixel usable as pn for the 4x2 after this one. We get:
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p0 p1 p2 p3 p4 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // Two sets of the same input pixels to apply two filters at once.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 1.
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
+ // left[0], left[1], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
+ // byte is an unused value, which shall be multiplied by 0 when we apply the
+ // filter.
+ constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
+
+ // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 2.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 3.
+
+ // Compute the middle 8 rows before using common code for the final 4 rows, in
+ // order to fit the assumption that |left| has the next TL at position 8.
+ if (height == 16) {
+ // This shift allows us to use pixel_order2 twice after shifting by 2 later.
+ left = _mm_slli_si128(left, 1);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
+ // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+
+ // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
+ // last byte is an unused value, as above. The top-left was shifted to
+ // position nine to keep two empty spaces after the top pixels.
+ constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
+
+ // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
+ // the end.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 4.
+
+ // First 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Clear all but final pixel in the first 8 of left column.
+ __m128i keep_top_left = _mm_srli_si128(left, 13);
+ dest += stride; // Move to y = 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
+ // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx xx xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ left = LoadLo8(left_ptr + 8);
+
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ dest += stride; // Move to y = 6.
+
+ // Second 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // Position TL value so we can use pixel_order1.
+ keep_top_left = _mm_slli_si128(keep_top_left, 6);
+ dest += stride; // Move to y = 7.
+ pixels = Load4(dest);
+ left = _mm_slli_si128(left, 7);
+ left = _mm_or_si128(left, keep_top_left);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 8.
+
+ // Third 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 9.
+
+ // Prepare final inputs.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 10.
+
+ // Fourth 4x2 in the if body.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 11.
+ }
+
+ // In both the height-8 and height-16 cases at this point, we can assume that
+ // |left| has the next TL at position 8.
+ if (height > 4) {
+ // Erase prior left pixels by shifting TL to position 0.
+ left = _mm_srli_si128(left, 8);
+ left = _mm_slli_si128(left, 6);
+ pixels = Load4(dest);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 12 or 4.
+
+ // First of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dest += stride; // Move to y = 13 or 5.
+ pixels = Load4(dest);
+ left = _mm_srli_si128(left, 2);
+
+ // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
+ // left[-1], left[0], left[1], left[2], left[3], ...
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ // p1 p2 p3 p4 xx xx p0 p5 p6 r5 r6 ...
+ // r0
+ pixels = _mm_or_si128(left, pixels);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ dest += stride; // Move to y = 14 or 6.
+
+ // Last of final two 4x2 blocks.
+ Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ }
+}
+
+void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
+ const void* const top_row,
+ const void* const left_column,
+ FilterIntraPredictor pred, const int width,
+ const int height) {
+ const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
+ const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (width == 4) {
+ Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
+ return;
+ }
+
+ // There is one set of 7 taps for each of the 4x2 output pixels.
+ const __m128i taps_0_1 = LoadAligned16(kFilterIntraTaps[pred][0]);
+ const __m128i taps_2_3 = LoadAligned16(kFilterIntraTaps[pred][2]);
+ const __m128i taps_4_5 = LoadAligned16(kFilterIntraTaps[pred][4]);
+ const __m128i taps_6_7 = LoadAligned16(kFilterIntraTaps[pred][6]);
+
+ // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
+ // the end is an unused value, which shall be multiplied by 0 when we apply
+ // the filter.
+ constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
+
+ // Takes the "left section" and puts it right after p0-p4.
+ const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
+
+ // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
+ // byte is unused as above.
+ constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
+
+ // Shuffles the "top left" from the left section, to the front. Used when
+ // grabbing data from left_column and not top_row.
+ const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
+
+ // This first pass takes care of the cases where the top left pixel comes from
+ // top_row.
+ __m128i pixels = LoadLo8(top_ptr - 1);
+ __m128i left = _mm_slli_si128(Load4(left_column), 8);
+ pixels = _mm_or_si128(pixels, left);
+
+ // Two sets of the same pixels to multiply with two sets of taps.
+ pixels = _mm_shuffle_epi8(pixels, pixel_order1);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
+ left = _mm_srli_si128(left, 1);
+
+ // Load the row-1 predictions; they serve as the top pixels of the next 4x2.
+ pixels = Load4(dst + stride);
+
+ // Because of the above shift, this OR 'invades' the final byte of the first
+ // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
+ // always a padded 0.
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ const ptrdiff_t stride2 = stride << 1;
+ const ptrdiff_t stride4 = stride << 2;
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ dst += 4;
+ for (int x = 3; x < width - 4; x += 4) {
+ pixels = Load4(top_ptr + x);
+ pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+
+ // Now we handle rows that reference previous blocks rather than |top_row|.
+ for (int y = 4; y < height; y += 4) {
+ // Leftmost 4x4 block for this height.
+ dst -= width;
+ dst += stride4;
+
+ // Top Left is not available by offset in these leftmost blocks.
+ pixels = Load4(dst - stride);
+ left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
+ left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+
+ // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
+ left = _mm_srli_si128(left, 2);
+ pixels = Load4(dst + stride);
+ pixels = _mm_or_si128(pixels, left);
+ pixels = _mm_shuffle_epi8(pixels, pixel_order2);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+
+ dst += 4;
+
+ // Remaining 4x4 blocks for this height.
+ for (int x = 4; x < width; x += 4) {
+ pixels = Load4(dst - stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[-1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
+ taps_6_7);
+ pixels = Load4(dst + stride - 1);
+ pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
+ pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
+
+ // Duplicate bottom half into upper half.
+ pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
+ Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
+ taps_4_5, taps_6_7);
+ dst += 4;
+ }
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+// These guards check that this version of the function was not superseded by
+// a higher optimization level, such as AVX. The corresponding #define also
+// prevents the C version from being added to the table.
+#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
+ dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
+#endif
+}
+
+} // namespace
+
+void IntraPredFilterInit_SSE4_1() { Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_SSE4_1
+namespace libgav1 {
+namespace dsp {
+
+void IntraPredFilterInit_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_filter_sse4.h b/libgav1/src/dsp/x86/intrapred_filter_sse4.h
new file mode 100644
index 0000000..ce28f93
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_filter_sse4.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::filter_intra_predictor, see the defines below for specifics.
+// These functions are not thread-safe.
+void IntraPredFilterInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal the sse4 implementation should be used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
+#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_FILTER_SSE4_H_
diff --git a/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc b/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
index a761813..de9f551 100644
--- a/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_smooth_sse4.cc
@@ -12,22 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "src/dsp/intrapred.h"
+#include "src/dsp/intrapred_smooth.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -67,29 +67,6 @@
Store4(dest, _mm_shuffle_epi8(pred, cvtepi32_epi8));
}
-template <int y_mask>
-inline __m128i SmoothVerticalSum4(const __m128i& top, const __m128i& weights,
- const __m128i& scaled_bottom_left) {
- const __m128i weights_y = _mm_shuffle_epi32(weights, y_mask);
- const __m128i weighted_top_y = _mm_mullo_epi16(top, weights_y);
- const __m128i scaled_bottom_left_y =
- _mm_shuffle_epi32(scaled_bottom_left, y_mask);
- return _mm_add_epi32(scaled_bottom_left_y, weighted_top_y);
-}
-
-template <int y_mask>
-inline void WriteSmoothVerticalSum4(uint8_t* dest, const __m128i& top,
- const __m128i& weights,
- const __m128i& scaled_bottom_left,
- const __m128i& round) {
- __m128i pred_sum =
- SmoothVerticalSum4<y_mask>(top, weights, scaled_bottom_left);
- // Equivalent to RightShiftWithRounding(pred[x][y], 8).
- pred_sum = _mm_srli_epi32(_mm_add_epi32(pred_sum, round), 8);
- const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
- Store4(dest, _mm_shuffle_epi8(pred_sum, cvtepi32_epi8));
-}
-
// For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
// |pixels| is a segment of the top row or the whole top row, and |weights| is
// repeated.
@@ -2649,7 +2626,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -2659,4 +2636,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/intrapred_smooth_sse4.h b/libgav1/src/dsp/x86/intrapred_smooth_sse4.h
new file mode 100644
index 0000000..9353371
--- /dev/null
+++ b/libgav1/src/dsp/x86/intrapred_smooth_sse4.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2021 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+#define LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::intra_predictors[][kIntraPredictorSmooth.*].
+// This function is not thread-safe.
+void IntraPredSmoothInit_SSE4_1();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If sse4 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the sse4 implementation should be
+// used.
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
+#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
+ LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
+
+#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SMOOTH_SSE4_H_
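Every #ifndef block above instantiates the same pattern, one per
(transform size, predictor) slot. A condensed sketch of how the C-side init
consumes these defines (the real code in intrapred_smooth.cc spells out every
slot; Smooth4x4_C here is a hypothetical name for the C implementation):

    // Register the C version only when no SIMD header claimed the slot.
    #ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
      dsp->intra_predictors[kTransformSize4x4][kIntraPredictorSmooth] =
          Smooth4x4_C;
    #endif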
diff --git a/libgav1/src/dsp/x86/intrapred_sse4.cc b/libgav1/src/dsp/x86/intrapred_sse4.cc
index 11ba9aa..063929d 100644
--- a/libgav1/src/dsp/x86/intrapred_sse4.cc
+++ b/libgav1/src/dsp/x86/intrapred_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/intrapred.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
@@ -23,13 +23,14 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstring> // memcpy
+#include <cstring>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
+#include "src/utils/constants.h"
namespace libgav1 {
namespace dsp {
@@ -51,10 +52,6 @@
return _mm_mulhi_epi16(interm, _mm_cvtsi32_si128(multiplier));
}
-// This shuffle mask selects 32-bit blocks in the order 0, 1, 0, 1, which
-// duplicates the first 8 bytes of a 128-bit vector into the second 8 bytes.
-constexpr int kDuplicateFirstHalf = 0x44;
-
//------------------------------------------------------------------------------
// DcPredFuncs_SSE4_1
@@ -1408,1337 +1405,6 @@
WritePaeth16x16(dst + 48, stride, top_left, top_3, left_3);
}
-//------------------------------------------------------------------------------
-// 7.11.2.4. Directional intra prediction process
-
-// Special case: An |xstep| of 64 corresponds to an angle delta of 45, meaning
-// upsampling is ruled out. In addition, the bits masked by 0x3F for
-// |shift_val| are 0 for all multiples of 64, so the formula
-// val = top[top_base_x]*(32-shift) + top[top_base_x+1]*shift reduces to
-// val = top[top_base_x] << 5. Since top_base_x is (y + 1) for row y, each row
-// is a plain copy of the top row advanced by one pixel; hence |top| is offset
-// by 1.
-inline void DirectionalZone1_Step64(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int width,
- const int height) {
- ptrdiff_t offset = 1;
- if (height == 4) {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- return;
- }
- int y = 0;
- do {
- memcpy(dst, top + offset, width);
- dst += stride;
- memcpy(dst, top + offset + 1, width);
- dst += stride;
- memcpy(dst, top + offset + 2, width);
- dst += stride;
- memcpy(dst, top + offset + 3, width);
- dst += stride;
- memcpy(dst, top + offset + 4, width);
- dst += stride;
- memcpy(dst, top + offset + 5, width);
- dst += stride;
- memcpy(dst, top + offset + 6, width);
- dst += stride;
- memcpy(dst, top + offset + 7, width);
- dst += stride;
-
- offset += 8;
- y += 8;
- } while (y < height);
-}
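Aside: modulo the 4-row unrolling, the function above reduces to this scalar
sketch (illustrative, not part of the patch):

    // xstep == 64 makes shift 0 on every row, so each row is a plain copy of
    // |top| advanced by one pixel per row.
    for (int y = 0; y < height; ++y) {
      memcpy(dst + y * stride, top + 1 + y, width);
    }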
-
-inline void DirectionalZone1_4xH(uint8_t* dst, ptrdiff_t stride,
- const uint8_t* const top, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const int rounding_bits = 5;
- const int max_base_x = (height + 3 /* width - 1 */) << upsample_shift;
- const __m128i final_top_val = _mm_set1_epi16(top[max_base_x]);
- const __m128i sampler = upsampled ? _mm_set_epi64x(0, 0x0706050403020100)
- : _mm_set_epi64x(0, 0x0403030202010100);
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
-  // All rows from |min_corner_only_y| down will simply use memset. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- int y = 0;
- int top_x = xstep;
-
- for (; y < min_corner_only_y; ++y, dst += stride, top_x += xstep) {
- const int top_base_x = top_x >> scale_bits;
-
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i max_shift = _mm_set1_epi8(32);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
-
- // Load 8 values because we will select the sampled values based on
- // |upsampled|.
- const __m128i values = LoadLo8(top + top_base_x);
- const __m128i sampled_values = _mm_shuffle_epi8(values, sampler);
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i prod = _mm_maddubs_epi16(sampled_values, shifts);
- prod = RightShiftWithRounding_U16(prod, rounding_bits);
- // Replace pixels from invalid range with top-right corner.
- prod = _mm_blendv_epi8(prod, final_top_val, past_max);
- Store4(dst, _mm_packus_epi16(prod, prod));
- }
-
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dst, top[max_base_x], /* width */ 4);
- dst += stride;
- }
-}
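Aside: starting |offsets| at 1 turns the unavailable packed "cmpge" into the
available signed "cmpgt". For lane k (k = 0..7), which predicts from
top[top_base_x + k]:

    past_max[k] = (top_base_x + (k + 1) > max_base_x)
                = (top_base_x + k >= max_base_x)

which is exactly the out-of-range test the comment describes.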
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_Large(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
-  // All rows from |min_corner_only_y| down will simply use memset. |max_base_x|
- // is always greater than |height|, so clipping to 1 is enough to make the
- // logic work.
- const int xstep_units = std::max(xstep >> scale_bits, 1);
- const int min_corner_only_y = std::min(max_base_x / xstep_units, height);
-
- // Rows up to this y-value can be computed without checking for bounds.
- const int max_no_corner_y = std::min(
- LeftShift((max_base_x - (base_step * width)), scale_bits) / xstep,
- height);
- // No need to check for exceeding |max_base_x| in the first loop.
- int y = 0;
- int top_x = xstep;
- for (; y < max_no_corner_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- for (; y < min_corner_only_y; ++y, dest += stride, top_x += xstep) {
- int top_base_x = top_x >> scale_bits;
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- const int min_corner_only_x =
- std::min(width, ((max_base_x - top_base_x) >> upsample_shift) + 7) & ~7;
- for (; x < min_corner_only_x;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- // Corner-only section of the row.
- memset(dest + x, top_row[max_base_x], width - x);
- }
- // Fill in corner-only rows.
- for (; y < height; ++y) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
-}
-
-// 7.11.2.4 (7) angle < 90
-inline void DirectionalZone1_SSE4_1(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const int width, const int height,
- const int xstep, const bool upsampled) {
- const int upsample_shift = static_cast<int>(upsampled);
- if (xstep == 64) {
- DirectionalZone1_Step64(dest, stride, top_row, width, height);
- return;
- }
- if (width == 4) {
- DirectionalZone1_4xH(dest, stride, top_row, height, xstep, upsampled);
- return;
- }
- if (width >= 32) {
- DirectionalZone1_Large(dest, stride, top_row, width, height, xstep,
- upsampled);
- return;
- }
- const __m128i sampler =
- upsampled ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const int scale_bits = 6 - upsample_shift;
- const int max_base_x = ((width + height) - 1) << upsample_shift;
-
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
- const int base_step = 1 << upsample_shift;
- const int base_step8 = base_step << 3;
-
- // No need to check for exceeding |max_base_x| in the loops.
- if (((xstep * height) >> scale_bits) + base_step * width < max_base_x) {
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
- // Permit negative values of |top_x|.
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- int x = 0;
- do {
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- top_base_x += base_step8;
- x += 8;
- } while (x < width);
- dest += stride;
- top_x += xstep;
- } while (++y < height);
- return;
- }
-
- // Each 16-bit value here corresponds to a position that may exceed
- // |max_base_x|. When added to the top_base_x, it is used to mask values
- // that pass the end of |top|. Starting from 1 to simulate "cmpge" which is
- // not supported for packed integers.
- const __m128i offsets =
- _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
-
- const __m128i max_base_x_vect = _mm_set1_epi16(max_base_x);
- const __m128i final_top_val = _mm_set1_epi16(top_row[max_base_x]);
- const __m128i base_step8_vect = _mm_set1_epi16(base_step8);
- int top_x = xstep;
- int y = 0;
- do {
- int top_base_x = top_x >> scale_bits;
-
- if (top_base_x >= max_base_x) {
- for (int i = y; i < height; ++i) {
- memset(dest, top_row[max_base_x], width);
- dest += stride;
- }
- return;
- }
-
- const int shift_val = (LeftShift(top_x, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i top_index_vect = _mm_set1_epi16(top_base_x);
- top_index_vect = _mm_add_epi16(top_index_vect, offsets);
-
- int x = 0;
- for (; x < width - 8;
- x += 8, top_base_x += base_step8,
- top_index_vect = _mm_add_epi16(top_index_vect, base_step8_vect)) {
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- // Assuming a buffer zone of 8 bytes at the end of top_row, this prevents
- // reading out of bounds. If all indices are past max and we don't need to
- // use the loaded bytes at all, |top_base_x| becomes 0. |top_base_x| will
- // reset for the next |y|.
- top_base_x &= ~_mm_cvtsi128_si32(past_max);
- const __m128i top_vals = LoadUnaligned16(top_row + top_base_x);
- __m128i vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- }
- const __m128i past_max = _mm_cmpgt_epi16(top_index_vect, max_base_x_vect);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(top_row + top_base_x);
- } else {
- const __m128i top_vals = LoadLo8(top_row + top_base_x);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- vals = _mm_insert_epi8(vals, top_row[top_base_x + 8], 15);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- vals = _mm_blendv_epi8(vals, final_top_val, past_max);
- StoreLo8(dest + x, _mm_packus_epi16(vals, vals));
- dest += stride;
- top_x += xstep;
- } while (++y < height);
-}
-
-void DirectionalIntraPredictorZone1_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const int width, const int height,
- const int xstep,
- const bool upsampled_top) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- auto* dst = static_cast<uint8_t*>(dest);
- DirectionalZone1_SSE4_1(dst, stride, top_ptr, width, height, xstep,
- upsampled_top);
-}
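Aside: every zone 1 path above vectorizes the same scalar recurrence. A
reference sketch, mirroring the formulas in the comments (this is not the
libgav1 C implementation):

    void Zone1Reference(uint8_t* dst, ptrdiff_t stride, const uint8_t* top,
                        int width, int height, int xstep, bool upsampled) {
      const int upsample_shift = static_cast<int>(upsampled);
      const int scale_bits = 6 - upsample_shift;
      const int max_base_x = ((width + height) - 1) << upsample_shift;
      const int base_step = 1 << upsample_shift;
      int top_x = xstep;
      for (int y = 0; y < height; ++y, top_x += xstep, dst += stride) {
        int base = top_x >> scale_bits;
        const int shift = ((top_x << upsample_shift) & 0x3F) >> 1;  // 0..31
        for (int x = 0; x < width; ++x, base += base_step) {
          if (base >= max_base_x) {
            dst[x] = top[max_base_x];  // Corner-only region.
            continue;
          }
          const int val = top[base] * (32 - shift) + top[base + 1] * shift;
          dst[x] = static_cast<uint8_t>((val + 16) >> 5);  // Round2(val, 5)
        }
      }
    }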
-
-template <bool upsampled>
-inline void DirectionalZone3_4x4(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler = _mm_set_epi64x(0, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[4];
- for (int x = 0, left_y = base_left_y; x < 4; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = ((left_y << upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadLo8(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadLo8(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- vals = RightShiftWithRounding_U16(vals, rounding_bits);
- result_block[x] = _mm_packus_epi16(vals, vals);
- }
- const __m128i result = Transpose4x4_U8(result_block);
- // This is result_row0.
- Store4(dest, result);
- dest += stride;
- const int result_row1 = _mm_extract_epi32(result, 1);
- memcpy(dest, &result_row1, sizeof(result_row1));
- dest += stride;
- const int result_row2 = _mm_extract_epi32(result, 2);
- memcpy(dest, &result_row2, sizeof(result_row2));
- dest += stride;
- const int result_row3 = _mm_extract_epi32(result, 3);
- memcpy(dest, &result_row3, sizeof(result_row3));
-}
-
-template <bool upsampled, int height>
-inline void DirectionalZone3_8xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const left_column,
- const int base_left_y, const int ystep) {
- // For use in the non-upsampled case.
- const __m128i sampler =
- _mm_set_epi64x(0x0807070606050504, 0x0403030202010100);
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shift = _mm_set1_epi8(32);
- const int rounding_bits = 5;
-
- __m128i result_block[8];
- for (int x = 0, left_y = base_left_y; x < 8; x++, left_y += ystep) {
- const int left_base_y = left_y >> scale_bits;
- const int shift_val = (LeftShift(left_y, upsample_shift) & 0x3F) >> 1;
- const __m128i shift = _mm_set1_epi8(shift_val);
- const __m128i opposite_shift = _mm_sub_epi8(max_shift, shift);
- const __m128i shifts = _mm_unpacklo_epi8(opposite_shift, shift);
- __m128i vals;
- if (upsampled) {
- vals = LoadUnaligned16(left_column + left_base_y);
- } else {
- const __m128i top_vals = LoadUnaligned16(left_column + left_base_y);
- vals = _mm_shuffle_epi8(top_vals, sampler);
- }
- vals = _mm_maddubs_epi16(vals, shifts);
- result_block[x] = RightShiftWithRounding_U16(vals, rounding_bits);
- }
- Transpose8x8_U16(result_block, result_block);
- for (int y = 0; y < height; ++y) {
- StoreLo8(dest, _mm_packus_epi16(result_block[y], result_block[y]));
- dest += stride;
- }
-}
-
-// 7.11.2.4 (9) angle > 180
-void DirectionalIntraPredictorZone3_SSE4_1(void* dest, ptrdiff_t stride,
- const void* const left_column,
- const int width, const int height,
- const int ystep,
- const bool upsampled) {
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_shift = static_cast<int>(upsampled);
- if (width == 4 || height == 4) {
- const ptrdiff_t stride4 = stride << 2;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<true>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_4x4<false>(dst_x, stride, left_ptr + y, left_y,
- ystep);
- dst_x += stride4;
- y += 4;
- } while (y < height);
- left_y += ystep << 2;
- x += 4;
- } while (x < width);
- }
- return;
- }
-
- const ptrdiff_t stride8 = stride << 3;
- if (upsampled) {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<true, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- } else {
- int left_y = ystep;
- int x = 0;
- do {
- uint8_t* dst_x = dst + x;
- int y = 0;
- do {
- DirectionalZone3_8xH<false, 8>(
- dst_x, stride, left_ptr + (y << upsample_shift), left_y, ystep);
- dst_x += stride8;
- y += 8;
- } while (y < height);
- left_y += ystep << 3;
- x += 8;
- } while (x < width);
- }
-}
-
-//------------------------------------------------------------------------------
-// Directional Zone 2 Functions
-// 7.11.2.4 (8)
-
-// DirectionalBlend* selectively overwrites the values written by
-// DirectionalZone2FromLeftCol*. |zone_bounds| has one 16-bit index for each
-// row.
-template <int y_selector>
-inline void DirectionalBlend4_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds) {
- const __m128i max_dest_x_vect = _mm_shufflelo_epi16(zone_bounds, y_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(Load4(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- Store4(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-inline void DirectionalBlend8_SSE4_1(uint8_t* dest,
- const __m128i& dest_index_vect,
- const __m128i& vals,
- const __m128i& zone_bounds,
- const __m128i& bounds_selector) {
- const __m128i max_dest_x_vect =
- _mm_shuffle_epi8(zone_bounds, bounds_selector);
- const __m128i use_left = _mm_cmplt_epi16(dest_index_vect, max_dest_x_vect);
- const __m128i original_vals = _mm_cvtepu8_epi16(LoadLo8(dest));
- const __m128i blended_vals = _mm_blendv_epi8(vals, original_vals, use_left);
- StoreLo8(dest, _mm_packus_epi16(blended_vals, blended_vals));
-}
-
-constexpr int kDirectionalWeightBits = 5;
-// |source| is packed with 4 or 8 pairs of 8-bit values from left or top.
-// |shifts| is named to match the specification, with 4 or 8 pairs of (32 -
-// shift) and shift. Shift is guaranteed to be between 0 and 32.
-inline __m128i DirectionalZone2FromSource_SSE4_1(const uint8_t* const source,
- const __m128i& shifts,
- const __m128i& sampler) {
- const __m128i src_vals = LoadUnaligned16(source);
- __m128i vals = _mm_shuffle_epi8(src_vals, sampler);
- vals = _mm_maddubs_epi16(vals, shifts);
- return RightShiftWithRounding_U16(vals, kDirectionalWeightBits);
-}
-
-// Because the source values "move backwards" as the row index increases, the
-// indices derived from ystep are generally negative. This is accommodated by
-// making sure the relative indices are within [-15, 0] when the function is
-// called, and sliding them into the inclusive range [0, 15], relative to a
-// lower base address.
-constexpr int kPositiveIndexOffset = 15;
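Aside: concretely, a relative index of -3 becomes shuffle index 12 after
adding kPositiveIndexOffset, and the load address is lowered by 15 bytes to
compensate, so the same source byte is selected:

    left_column[-3] == (left_column - kPositiveIndexOffset)[15 + (-3)]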
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_4x4_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column_base,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_cvtsi32_si128(0x01010101);
- const __m128i positive_offset = _mm_set1_epi8(kPositiveIndexOffset);
- // Left_column and sampler are both offset by 15 so the indices are always
- // positive.
- const uint8_t* left_column = left_column_base - kPositiveIndexOffset;
- for (int y = 0; y < 4; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
-
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- // Slide valid |offset_y| indices from range [-15, 0] to [0, 15] so they
- // can work as shuffle indices. Some values may be out of bounds, but their
- // pred results will be masked over by top prediction.
- sampler = _mm_add_epi8(sampler, positive_offset);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column + (y << upsample_shift), shifts, sampler);
- Store4(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// The height at which a load of 16 bytes will not contain enough source pixels
-// from |left_column| to supply an accurate row when computing 8 pixels at a
-// time. The values are found by inspection. By coincidence, all angles that
-// satisfy (ystep >> 6) == 2 map to the same value, so it is enough to look up
-// by ystep >> 6. The largest index for this lookup is 1023 >> 6 == 15.
-constexpr int kDirectionalZone2ShuffleInvalidHeight[16] = {
- 1024, 1024, 16, 16, 16, 16, 0, 0, 18, 0, 0, 0, 0, 0, 0, 40};
-
-template <bool upsampled>
-inline void DirectionalZone2FromLeftCol_8x8_SSE4_1(
- uint8_t* dst, ptrdiff_t stride, const uint8_t* const left_column,
- __m128i left_y) {
- const int upsample_shift = static_cast<int>(upsampled);
- const int scale_bits = 6 - upsample_shift;
- const __m128i max_shifts = _mm_set1_epi8(32);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- const __m128i index_increment = _mm_set1_epi8(1);
- const __m128i denegation = _mm_set1_epi8(kPositiveIndexOffset);
- for (int y = 0; y < 8; dst += stride, ++y) {
- __m128i offset_y = _mm_srai_epi16(left_y, scale_bits);
- offset_y = _mm_packs_epi16(offset_y, offset_y);
- const __m128i adjacent = _mm_add_epi8(offset_y, index_increment);
-
- // Offset the relative index because ystep is negative in Zone 2 and shuffle
- // indices must be nonnegative.
- __m128i sampler = _mm_unpacklo_epi8(offset_y, adjacent);
- sampler = _mm_add_epi8(sampler, denegation);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(left_y, upsample_shift), shift_mask), 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shifts, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
-
- // The specification adds (y << 6) to left_y, which is subject to
- // upsampling, but this puts sampler indices out of the 0-15 range. It is
-    // equivalent to offsetting the source address by (y << upsample_shift)
-    // instead.
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- left_column - kPositiveIndexOffset + (y << upsample_shift), shifts,
- sampler);
- StoreLo8(dst, _mm_packus_epi16(vals, vals));
- }
-}
-
-// |zone_bounds| is an epi16 of the relative x index at which base >= -(1 <<
-// upsampled_top), for each row. When there are 4 values, they can be duplicated
-// with a non-register shuffle mask.
-// |shifts| is one pair of weights that applies throughout a given row.
-template <bool upsampled_top>
-inline void DirectionalZone1Blend_4x4(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
- top_x -= xstep;
-
- int top_base_x = (top_x >> scale_bits_x);
- const __m128i vals0 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x00), sampler);
- DirectionalBlend4_SSE4_1<0x00>(dest, dest_index_x, vals0, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals1 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0x55), sampler);
- DirectionalBlend4_SSE4_1<0x55>(dest, dest_index_x, vals1, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals2 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xAA), sampler);
- DirectionalBlend4_SSE4_1<0xAA>(dest, dest_index_x, vals2, zone_bounds);
- top_x -= xstep;
- dest += stride;
-
- top_base_x = (top_x >> scale_bits_x);
- const __m128i vals3 = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shufflelo_epi16(shifts, 0xFF), sampler);
- DirectionalBlend4_SSE4_1<0xFF>(dest, dest_index_x, vals3, zone_bounds);
-}
-
-template <bool upsampled_top, int height>
-inline void DirectionalZone1Blend_8xH(
- uint8_t* dest, const uint8_t* const top_row, ptrdiff_t stride,
- __m128i sampler, const __m128i& zone_bounds, const __m128i& shifts,
- const __m128i& dest_index_x, int top_x, const int xstep) {
- const int upsample_shift = static_cast<int>(upsampled_top);
- const int scale_bits_x = 6 - upsample_shift;
-
- __m128i y_selector = _mm_set1_epi32(0x01000100);
- const __m128i index_increment = _mm_set1_epi32(0x02020202);
- for (int y = 0; y < height; ++y,
- y_selector = _mm_add_epi8(y_selector, index_increment),
- dest += stride) {
- top_x -= xstep;
- const int top_base_x = top_x >> scale_bits_x;
- const __m128i vals = DirectionalZone2FromSource_SSE4_1(
- top_row + top_base_x, _mm_shuffle_epi8(shifts, y_selector), sampler);
- DirectionalBlend8_SSE4_1(dest, dest_index_x, vals, zone_bounds, y_selector);
- }
-}
-
-// 7.11.2.4 (8) 90 < angle < 180
-// The strategy for this function is to know how many blocks can be processed
-// with just pixels from |top_ptr|, then handle mixed blocks, then handle only
-// blocks that take from |left_ptr|. Additionally, a fast index-shuffle
-// approach is used for pred values from |left_column| in sections that permit
-// it.
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride8 = stride << 3;
- const __m128i dest_index_x =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute. This assumes minimum |xstep| is 3.
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- // For steep angles, the source pixels from left_column may not fit in a
- // 16-byte load for shuffling.
- // TODO(petersonab): Find a more precise formula for this subject to x.
- const int max_shuffle_height =
- std::min(height, kDirectionalZone2ShuffleInvalidHeight[ystep >> 6]);
-
- const int xstep8 = xstep << 3;
- const __m128i xstep8_vect = _mm_set1_epi16(xstep8);
- // Accumulate xstep across 8 rows.
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
- const __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
-  // Offsets the original zone bound value to simplify
-  // x < (y + 1) * xstep / 64 - 1.
- const __m128i scaled_one = _mm_set1_epi16(-64);
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep8 = ystep << 3;
- const int left_base_increment8 = ystep8 >> 6;
- const int ystep_remainder8 = ystep8 & 0x3F;
- const __m128i increment_left8 = _mm_set1_epi16(-ystep_remainder8);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which is covered under the left_column
- // offset. Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
-
- const __m128i increment_top8 = _mm_set1_epi16(8 << 6);
- int x = 0;
-
-  // This loop treats each set of 8 columns in 3 stages with y-value boundaries.
- // The first stage, before the first y-loop, covers blocks that are only
- // computed from the top row. The second stage, comprising two y-loops, covers
- // blocks that have a mixture of values computed from top or left. The final
- // stage covers blocks that are only computed from the left.
- for (int left_offset = -left_base_increment; x < min_top_only_x;
- x += 8,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top8),
- // Watch left_y because it can still get big.
- left_y = _mm_add_epi16(left_y, increment_left8),
- left_offset -= left_base_increment8) {
- uint8_t* dst_x = dst + x;
-
- // Round down to the nearest multiple of 8.
- const int max_top_only_y = std::min(((x + 1) << 6) / xstep, height) & ~7;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- DirectionalZone1_4xH(dst_x + 4, stride,
- top_row + ((x + 4) << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
-
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute.
- const int min_left_only_y = std::min(((x + 8) << 6) / xstep, height);
- // At high angles such that min_left_only_y < 8, ystep is low and xstep is
- // high. This means that max_shuffle_height is unbounded and xstep_bounds
- // will overflow in 16 bits. This is prevented by stopping the first
- // blending loop at min_left_only_y for such cases, which means we skip over
- // the second blending loop as well.
- const int left_shuffle_stop_y =
- std::min(max_shuffle_height, min_left_only_y);
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- for (; y < left_shuffle_stop_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- DirectionalZone2FromLeftCol_8x8_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Pick up from the last y-value, using the 10% slower but secure method for
- // left prediction.
- const auto base_left_y = static_cast<int16_t>(_mm_extract_epi16(left_y, 0));
- for (; y < min_left_only_y;
- y += 8, dst_x += stride8,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep8_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep8_vect),
- top_x -= xstep8) {
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
-
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- DirectionalZone1Blend_8xH<upsampled_top, 8>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left_only rows.
- for (; y < height; y += 8, dst_x += stride8) {
- DirectionalZone3_8xH<upsampled_left, 8>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), base_left_y,
- -ystep);
- }
- }
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
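Aside: per 8-column strip, the three stages cover the y ranges
[0, max_top_only_y), [max_top_only_y, min_left_only_y) and
[min_left_only_y, height), with the boundaries taken straight from the code
above:

    max_top_only_y  = min(((x + 1) << 6) / xstep, height) & ~7
    min_left_only_y = min(((x + 8) << 6) / xstep, height)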
-
-template <bool upsampled_left, bool upsampled_top>
-inline void DirectionalZone2_4_SSE4_1(void* dest, ptrdiff_t stride,
- const uint8_t* const top_row,
- const uint8_t* const left_column,
- const int width, const int height,
- const int xstep, const int ystep) {
- auto* dst = static_cast<uint8_t*>(dest);
- const int upsample_left_shift = static_cast<int>(upsampled_left);
- const int upsample_top_shift = static_cast<int>(upsampled_top);
- const __m128i max_shift = _mm_set1_epi8(32);
- const ptrdiff_t stride4 = stride << 2;
- const __m128i dest_index_x = _mm_set_epi32(0, 0, 0x00030002, 0x00010000);
- const __m128i sampler_top =
- upsampled_top
- ? _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100)
- : _mm_set_epi32(0x08070706, 0x06050504, 0x04030302, 0x02010100);
- // All columns from |min_top_only_x| to the right will only need |top_row| to
- // compute.
- assert(xstep >= 3);
- const int min_top_only_x = std::min((height * xstep) >> 6, width);
-
- const int xstep4 = xstep << 2;
- const __m128i xstep4_vect = _mm_set1_epi16(xstep4);
- const __m128i xstep_dup = _mm_set1_epi16(-xstep);
- const __m128i increments = _mm_set_epi32(0, 0, 0x00040003, 0x00020001);
- __m128i xstep_for_shift = _mm_mullo_epi16(xstep_dup, increments);
- const __m128i scaled_one = _mm_set1_epi16(-64);
-  // Offsets the original zone bound value to simplify
-  // x < (y + 1) * xstep / 64 - 1.
- __m128i xstep_bounds_base =
- (xstep == 64) ? _mm_sub_epi16(scaled_one, xstep_for_shift)
- : _mm_sub_epi16(_mm_set1_epi16(-1), xstep_for_shift);
-
- const int left_base_increment = ystep >> 6;
- const int ystep_remainder = ystep & 0x3F;
- const int ystep4 = ystep << 2;
- const int left_base_increment4 = ystep4 >> 6;
- // This is guaranteed to be less than 64, but accumulation may bring it past
- // 64 for higher x values.
- const int ystep_remainder4 = ystep4 & 0x3F;
- const __m128i increment_left4 = _mm_set1_epi16(-ystep_remainder4);
- const __m128i increment_top4 = _mm_set1_epi16(4 << 6);
-
- // If the 64 scaling is regarded as a decimal point, the first value of the
- // left_y vector omits the portion which will go into the left_column offset.
- // Following values need the full ystep as a relative offset.
- const __m128i ystep_init = _mm_set1_epi16(-ystep_remainder);
- const __m128i ystep_dup = _mm_set1_epi16(-ystep);
- __m128i left_y = _mm_mullo_epi16(ystep_dup, dest_index_x);
- left_y = _mm_add_epi16(ystep_init, left_y);
- const __m128i shift_mask = _mm_set1_epi32(0x003F003F);
-
- int x = 0;
- // Loop over x for columns with a mixture of sources.
- for (int left_offset = -left_base_increment; x < min_top_only_x; x += 4,
- xstep_bounds_base = _mm_sub_epi16(xstep_bounds_base, increment_top4),
- left_y = _mm_add_epi16(left_y, increment_left4),
- left_offset -= left_base_increment4) {
- uint8_t* dst_x = dst + x;
-
-    // Round down to a multiple of 4; the y-loop below processes 4 rows at a
-    // time.
- const int max_top_only_y = std::min((x << 6) / xstep, height) & 0xFFFFFFF4;
- DirectionalZone1_4xH(dst_x, stride, top_row + (x << upsample_top_shift),
- max_top_only_y, -xstep, upsampled_top);
- int y = max_top_only_y;
- dst_x += stride * y;
- const int xstep_y = xstep * y;
- const __m128i xstep_y_vect = _mm_set1_epi16(xstep_y);
- // All rows from |min_left_only_y| down for this set of columns, only need
- // |left_column| to compute. Rounded up to the nearest multiple of 4.
- const int min_left_only_y = std::min(((x + 4) << 6) / xstep, height);
-
- __m128i xstep_bounds = _mm_add_epi16(xstep_bounds_base, xstep_y_vect);
- __m128i xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift, xstep_y_vect);
- int top_x = -xstep_y;
-
- // Loop over y for mixed rows.
- for (; y < min_left_only_y;
- y += 4, dst_x += stride4,
- xstep_bounds = _mm_add_epi16(xstep_bounds, xstep4_vect),
- xstep_for_shift_y = _mm_sub_epi16(xstep_for_shift_y, xstep4_vect),
- top_x -= xstep4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) * (1 << upsample_left_shift)),
- left_y);
-
- __m128i shifts = _mm_srli_epi16(
- _mm_and_si128(_mm_slli_epi16(xstep_for_shift_y, upsample_top_shift),
- shift_mask),
- 1);
- shifts = _mm_packus_epi16(shifts, shifts);
- const __m128i opposite_shifts = _mm_sub_epi8(max_shift, shifts);
- shifts = _mm_unpacklo_epi8(opposite_shifts, shifts);
- const __m128i xstep_bounds_off = _mm_srai_epi16(xstep_bounds, 6);
- DirectionalZone1Blend_4x4<upsampled_top>(
- dst_x, top_row + (x << upsample_top_shift), stride, sampler_top,
- xstep_bounds_off, shifts, dest_index_x, top_x, xstep);
- }
- // Loop over y for left-only rows, if any.
- for (; y < height; y += 4, dst_x += stride4) {
- DirectionalZone2FromLeftCol_4x4_SSE4_1<upsampled_left>(
- dst_x, stride,
- left_column + ((left_offset + y) << upsample_left_shift), left_y);
- }
- }
- // Loop over top-only columns, if any.
- for (; x < width; x += 4) {
- DirectionalZone1_4xH(dst + x, stride, top_row + (x << upsample_top_shift),
- height, -xstep, upsampled_top);
- }
-}
-
-void DirectionalIntraPredictorZone2_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- const int width, const int height,
- const int xstep, const int ystep,
- const bool upsampled_top,
- const bool upsampled_left) {
- // Increasing the negative buffer for this function allows more rows to be
- // processed at a time without branching in an inner loop to check the base.
- uint8_t top_buffer[288];
- uint8_t left_buffer[288];
- memcpy(top_buffer + 128, static_cast<const uint8_t*>(top_row) - 16, 160);
- memcpy(left_buffer + 128, static_cast<const uint8_t*>(left_column) - 16, 160);
- const uint8_t* top_ptr = top_buffer + 144;
- const uint8_t* left_ptr = left_buffer + 144;
- if (width == 4 || height == 4) {
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_4_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_4_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
- return;
- }
- if (upsampled_left) {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<true, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<true, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- } else {
- if (upsampled_top) {
- DirectionalZone2_SSE4_1<false, true>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- } else {
- DirectionalZone2_SSE4_1<false, false>(dest, stride, top_ptr, left_ptr,
- width, height, xstep, ystep);
- }
- }
-}
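Aside: the scratch-buffer offsets above line up as follows; the 128 bytes
below the copied range are the enlarged negative zone the comment refers to,
there only so that speculative loads stay inside the buffer:

    top_buffer + 128  ==  copy of (top_row - 16), 160 bytes
    top_ptr           ==  top_buffer + 144  ==  copy of top_row
    copied data       :   top_ptr[-16] .. top_ptr[143]
    negative headroom :   top_ptr[-144] .. top_ptr[-17]  (uninitialized)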
-
-//------------------------------------------------------------------------------
-// FilterIntraPredictor_SSE4_1
-
-// Apply all filter taps to the given 7 packed 8-bit values, keeping the 8th
-// at zero to preserve the sum.
-inline void Filter4x2_SSE4_1(uint8_t* dst, const ptrdiff_t stride,
- const __m128i& pixels, const __m128i& taps_0_1,
- const __m128i& taps_2_3, const __m128i& taps_4_5,
- const __m128i& taps_6_7) {
- const __m128i mul_0_01 = _mm_maddubs_epi16(pixels, taps_0_1);
- const __m128i mul_0_23 = _mm_maddubs_epi16(pixels, taps_2_3);
- // |output_half| contains 8 partial sums.
- __m128i output_half = _mm_hadd_epi16(mul_0_01, mul_0_23);
- __m128i output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row0 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst, output_row0);
- const __m128i mul_1_01 = _mm_maddubs_epi16(pixels, taps_4_5);
- const __m128i mul_1_23 = _mm_maddubs_epi16(pixels, taps_6_7);
- output_half = _mm_hadd_epi16(mul_1_01, mul_1_23);
- output = _mm_hadd_epi16(output_half, output_half);
- const __m128i output_row1 =
- _mm_packus_epi16(RightShiftWithRounding_S16(output, 4),
- /* arbitrary pack arg */ output);
- Store4(dst + stride, output_row1);
-}
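Aside: each output pixel of the filter intra predictor is a 7-tap weighted
sum over p = {TL, T0..T3, L0, L1}; the two _mm_hadd_epi16 steps fold the
eight pair-products per output down to a single 16-bit sum before the
rounding shift. Writing tap[pred][x][i] schematically for the i-th tap of
output x (kFilterIntraTaps stores these interleaved in pairs for
_mm_maddubs_epi16):

    out[x] = Round2(sum_{i = 0..6} tap[pred][x][i] * p[i], 4)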
-
-// 4xH transform sizes are given special treatment because LoadLo8 goes out
-// of bounds and every block involves the left column. This implementation
-// loads TL from the top row for the first block, so it is not read from
-// |left_ptr|.
-inline void Filter4xH(uint8_t* dest, ptrdiff_t stride,
- const uint8_t* const top_ptr,
- const uint8_t* const left_ptr, FilterIntraPredictor pred,
- const int height) {
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
- __m128i top = Load4(top_ptr - 1);
- __m128i pixels = _mm_insert_epi8(top, top_ptr[3], 4);
- __m128i left = (height == 4 ? Load4(left_ptr) : LoadLo8(left_ptr));
- left = _mm_slli_si128(left, 5);
-
- // Relative pixels: top[-1], top[0], top[1], top[2], top[3], left[0], left[1],
- // left[2], left[3], left[4], left[5], left[6], left[7]
- pixels = _mm_or_si128(left, pixels);
-
- // Duplicate first 8 bytes.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 1.
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, left[-2], left[-1],
- // left[0], left[1], ...
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 6, 0, 1, 2, 3, 7, 8, 15. The last
- // byte is an unused value, which shall be multiplied by 0 when we apply the
- // filter.
- constexpr int64_t kInsertTopLeftFirstMask = 0x0F08070302010006;
-
- // Insert left[-1] in front as TL and put left[0] and left[1] at the end.
- const __m128i pixel_order1 = _mm_set1_epi64x(kInsertTopLeftFirstMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 2.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 3.
-
- // Compute the middle 8 rows before using common code for the final 4 rows.
- // The common code below this block assumes that the |left| vector holds the
- // next TL at position 8.
- if (height == 16) {
- // This shift allows us to use pixel_order2 twice after shifting by 2 later.
- left = _mm_slli_si128(left, 1);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty, left[-4],
- // left[-3], left[-2], left[-1], left[0], left[1], left[2], left[3]
- pixels = _mm_or_si128(left, pixels);
-
- // This mask rearranges bytes in the order: 9, 0, 1, 2, 3, 10, 11, 15. The
- // last byte is an unused value, as above. The top-left was shifted to
- // position nine to keep two empty spaces after the top pixels.
- constexpr int64_t kInsertTopLeftSecondMask = 0x0F0B0A0302010009;
-
- // Insert (relative) left[-1] in front as TL and put left[0] and left[1] at
- // the end.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftSecondMask);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 4.
-
- // First 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // Clear all but final pixel in the first 8 of left column.
- __m128i keep_top_left = _mm_srli_si128(left, 13);
- dest += stride; // Move to y = 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-6],
- // left[-5], left[-4], left[-3], left[-2], left[-1], left[0], left[1]
- pixels = _mm_or_si128(left, pixels);
- left = LoadLo8(left_ptr + 8);
-
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- dest += stride; // Move to y = 6.
-
- // Second 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // Position TL value so we can use pixel_order1.
- keep_top_left = _mm_slli_si128(keep_top_left, 6);
- dest += stride; // Move to y = 7.
- pixels = Load4(dest);
- left = _mm_slli_si128(left, 7);
- left = _mm_or_si128(left, keep_top_left);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 8.
-
- // Third 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 9.
-
- // Prepare final inputs.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 10.
-
- // Fourth 4x2 in the if body.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 11.
- }
-
- // In both the 8 and 16 case, we assume that the left vector has the next TL
- // at position 8.
- if (height > 4) {
- // Erase prior left pixels by shifting TL to position 0.
- left = _mm_srli_si128(left, 8);
- left = _mm_slli_si128(left, 6);
- pixels = Load4(dest);
-
- // Relative pixels: top[0], top[1], top[2], top[3], empty, empty,
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 12 or 4.
-
- // First of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dest += stride; // Move to y = 13 or 5.
- pixels = Load4(dest);
- left = _mm_srli_si128(left, 2);
-
- // Relative pixels: top[0], top[1], top[2], top[3], left[-3], left[-2]
- // left[-1], left[0], left[1], left[2], left[3], ...
- pixels = _mm_or_si128(left, pixels);
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- dest += stride; // Move to y = 14 or 6.
-
- // Last of final two 4x2 blocks.
- Filter4x2_SSE4_1(dest, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- }
-}
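The shuffle masks in Filter4xH encode per-byte source indices for _mm_shuffle_epi8, least significant byte first. A standalone check of kInsertTopLeftFirstMask (requires only SSSE3; input values chosen arbitrarily):

#include <cstdint>
#include <cstdio>
#include <tmmintrin.h>  // _mm_shuffle_epi8 (SSSE3)

int main() {
  alignas(16) uint8_t src[16];
  for (int i = 0; i < 16; ++i) src[i] = static_cast<uint8_t>(i);
  const __m128i v = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
  // Bytes of 0x0F08070302010006, low first: 6, 0, 1, 2, 3, 7, 8, 15.
  const __m128i order = _mm_set1_epi64x(0x0F08070302010006);
  const __m128i out = _mm_shuffle_epi8(v, order);
  alignas(16) uint8_t res[16];
  _mm_store_si128(reinterpret_cast<__m128i*>(res), out);
  for (int i = 0; i < 8; ++i) printf("%d ", res[i]);  // 6 0 1 2 3 7 8 15
  printf("\n");
  return 0;
}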
-
-void FilterIntraPredictor_SSE4_1(void* const dest, ptrdiff_t stride,
- const void* const top_row,
- const void* const left_column,
- FilterIntraPredictor pred, const int width,
- const int height) {
- const auto* const top_ptr = static_cast<const uint8_t*>(top_row);
- const auto* const left_ptr = static_cast<const uint8_t*>(left_column);
- auto* dst = static_cast<uint8_t*>(dest);
- if (width == 4) {
- Filter4xH(dst, stride, top_ptr, left_ptr, pred, height);
- return;
- }
-
- // There is one set of 7 taps for each of the 4x2 output pixels.
- const __m128i taps_0_1 = LoadUnaligned16(kFilterIntraTaps[pred][0]);
- const __m128i taps_2_3 = LoadUnaligned16(kFilterIntraTaps[pred][2]);
- const __m128i taps_4_5 = LoadUnaligned16(kFilterIntraTaps[pred][4]);
- const __m128i taps_6_7 = LoadUnaligned16(kFilterIntraTaps[pred][6]);
-
- // This mask rearranges bytes in the order: 0, 1, 2, 3, 4, 8, 9, 15. The 15 at
- // the end is an unused value, which shall be multiplied by 0 when we apply
- // the filter.
- constexpr int64_t kCondenseLeftMask = 0x0F09080403020100;
-
- // Takes the "left section" and puts it right after p0-p4.
- const __m128i pixel_order1 = _mm_set1_epi64x(kCondenseLeftMask);
-
- // This mask rearranges bytes in the order: 8, 0, 1, 2, 3, 9, 10, 15. The last
- // byte is unused as above.
- constexpr int64_t kInsertTopLeftMask = 0x0F0A090302010008;
-
- // Shuffles the "top left" from the left section, to the front. Used when
- // grabbing data from left_column and not top_row.
- const __m128i pixel_order2 = _mm_set1_epi64x(kInsertTopLeftMask);
-
- // This first pass takes care of the cases where the top left pixel comes from
- // top_row.
- __m128i pixels = LoadLo8(top_ptr - 1);
- __m128i left = _mm_slli_si128(Load4(left_column), 8);
- pixels = _mm_or_si128(pixels, left);
-
- // Two sets of the same pixels to multiply with two sets of taps.
- pixels = _mm_shuffle_epi8(pixels, pixel_order1);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5, taps_6_7);
- left = _mm_srli_si128(left, 1);
-
- // Load the row just written at y = 1; it supplies the top pixels for the
- // next 4x2 block.
- pixels = Load4(dst + stride);
-
- // Because of the above shift, this OR 'invades' the final byte of the first
- // 8 bytes of |pixels|. This is acceptable because the 8th filter tap is
- // always a padded 0.
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- const ptrdiff_t stride2 = stride << 1;
- const ptrdiff_t stride4 = stride << 2;
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- dst += 4;
- for (int x = 3; x < width - 4; x += 4) {
- pixels = Load4(top_ptr + x);
- pixels = _mm_insert_epi8(pixels, top_ptr[x + 4], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride + stride2 - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
-
- // Now we handle heights that reference previous blocks rather than top_row.
- for (int y = 4; y < height; y += 4) {
- // Leftmost 4x4 block for this height.
- dst -= width;
- dst += stride4;
-
- // Top Left is not available by offset in these leftmost blocks.
- pixels = Load4(dst - stride);
- left = _mm_slli_si128(Load4(left_ptr + y - 1), 8);
- left = _mm_insert_epi8(left, left_ptr[y + 3], 12);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
-
- // The bytes shifted into positions 6 and 7 will be ignored by the shuffle.
- left = _mm_srli_si128(left, 2);
- pixels = Load4(dst + stride);
- pixels = _mm_or_si128(pixels, left);
- pixels = _mm_shuffle_epi8(pixels, pixel_order2);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
-
- dst += 4;
-
- // Remaining 4x4 blocks for this height.
- for (int x = 4; x < width; x += 4) {
- pixels = Load4(dst - stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[-stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[-1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst, stride, pixels, taps_0_1, taps_2_3, taps_4_5,
- taps_6_7);
- pixels = Load4(dst + stride - 1);
- pixels = _mm_insert_epi8(pixels, dst[stride + 3], 4);
- pixels = _mm_insert_epi8(pixels, dst[stride2 - 1], 5);
- pixels = _mm_insert_epi8(pixels, dst[stride2 + stride - 1], 6);
-
- // Duplicate bottom half into upper half.
- pixels = _mm_shuffle_epi32(pixels, kDuplicateFirstHalf);
- Filter4x2_SSE4_1(dst + stride2, stride, pixels, taps_0_1, taps_2_3,
- taps_4_5, taps_6_7);
- dst += 4;
- }
- }
-}
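Conceptually, FILTER_INTRA is recursive: every 4x2 tile reads pixels the predictor has already produced, which is why the code above finishes each 4-row band left to right before stepping down by stride4. A traversal sketch under that reading (names hypothetical, edge fallback elided):

#include <cstddef>
#include <cstdint>

// Hypothetical outline of the tile order. Each 4x2 tile consumes five top
// neighbors (TL plus the four above) and two left neighbors, all either
// given in top_row/left_column or written by an earlier tile, so tiles must
// run left-to-right within a band and bands top-to-bottom.
void FilterIntraOrderSketch(uint8_t* dst, ptrdiff_t stride, int width,
                            int height) {
  for (int y = 0; y < height; y += 2) {
    for (int x = 0; x < width; x += 4) {
      // Inputs: dst[(y - 1) * stride + (x - 1 .. x + 3)], plus
      // dst[y * stride + x - 1] and dst[(y + 1) * stride + x - 1].
      // Predict4x2(dst + y * stride + x, stride);  // the 7-tap filters
      (void)dst;
      (void)stride;
    }
  }
}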
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
@@ -2746,21 +1412,6 @@
// These guards check if this version of the function was not superseded by
// a higher optimization level, such as AVX. The corresponding #define also
// prevents the C version from being added to the table.
-#if DSP_ENABLED_8BPP_SSE4_1(FilterIntraPredictor)
- dsp->filter_intra_predictor = FilterIntraPredictor_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone1)
- dsp->directional_intra_predictor_zone1 =
- DirectionalIntraPredictorZone1_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone2)
- dsp->directional_intra_predictor_zone2 =
- DirectionalIntraPredictorZone2_SSE4_1;
-#endif
-#if DSP_ENABLED_8BPP_SSE4_1(DirectionalIntraPredictorZone3)
- dsp->directional_intra_predictor_zone3 =
- DirectionalIntraPredictorZone3_SSE4_1;
-#endif
#if DSP_ENABLED_8BPP_SSE4_1(TransformSize4x4_IntraPredictorDcTop)
dsp->intra_predictors[kTransformSize4x4][kIntraPredictorDcTop] =
DcDefs::_4x4::DcTop;
@@ -3524,7 +2175,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -3532,4 +2183,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
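The guard convention mentioned near Init8bpp is a pure preprocessor protocol: a higher optimization level claims a function's macro first, and lower levels as well as the C fallback test for it with #ifndef. A minimal, self-contained sketch of the idea (macro names are illustrative, not libgav1's):

#include <cstdio>

#define CPU_SSE4_1 1
#define CPU_AVX2 2

// "avx2 header": claims the slot first when AVX2 is targeted.
#define Dsp8bpp_SomeFunction CPU_AVX2

// "sse4 header": claims the slot only if it is still free.
#ifndef Dsp8bpp_SomeFunction
#define Dsp8bpp_SomeFunction CPU_SSE4_1
#endif

int main() {
#if Dsp8bpp_SomeFunction == CPU_AVX2
  std::puts("the dispatch table gets the AVX2 function");
#elif Dsp8bpp_SomeFunction == CPU_SSE4_1
  std::puts("the dispatch table gets the SSE4.1 function");
#endif
  return 0;
}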
diff --git a/libgav1/src/dsp/x86/intrapred_sse4.h b/libgav1/src/dsp/x86/intrapred_sse4.h
index eb3825d..1f6f30a 100644
--- a/libgav1/src/dsp/x86/intrapred_sse4.h
+++ b/libgav1/src/dsp/x86/intrapred_sse4.h
@@ -23,36 +23,16 @@
namespace libgav1 {
namespace dsp {
-// Initializes Dsp::intra_predictors, Dsp::directional_intra_predictor_zone*,
-// Dsp::cfl_intra_predictors, Dsp::cfl_subsamplers and
-// Dsp::filter_intra_predictor, see the defines below for specifics. These
-// functions are not thread-safe.
+// Initializes Dsp::intra_predictors. See the defines below for specifics.
+// These functions are not thread-safe.
void IntraPredInit_SSE4_1();
-void IntraPredCflInit_SSE4_1();
-void IntraPredSmoothInit_SSE4_1();
} // namespace dsp
} // namespace libgav1
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
-#ifndef LIBGAV1_Dsp8bpp_FilterIntraPredictor
-#define LIBGAV1_Dsp8bpp_FilterIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone1 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone2 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3
-#define LIBGAV1_Dsp8bpp_DirectionalIntraPredictorZone3 LIBGAV1_CPU_SSE4_1
-#endif
-
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcTop LIBGAV1_CPU_SSE4_1
#endif
@@ -138,174 +118,6 @@
LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler420 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflSubsampler444 LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_CflIntraPredictor LIBGAV1_CPU_SSE4_1
-#endif
-
#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft
#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorDcLeft LIBGAV1_CPU_SSE4_1
#endif
@@ -658,287 +470,6 @@
LIBGAV1_CPU_SSE4_1
#endif
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmooth LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmooth \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothVertical \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize4x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize8x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x4_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize16x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x8_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize32x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x16_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x32_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
-#ifndef LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal
-#define LIBGAV1_Dsp8bpp_TransformSize64x64_IntraPredictorSmoothHorizontal \
- LIBGAV1_CPU_SSE4_1
-#endif
-
//------------------------------------------------------------------------------
// 10bpp
@@ -1055,6 +586,6 @@
LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_INTRAPRED_SSE4_H_
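As the header says, the Init functions are not thread-safe: they write the shared dispatch table and are expected to run once, up front. An illustrative wrapper showing that contract (the call_once guard is this sketch's, not libgav1's; the library performs its own one-time init internally):

#include <mutex>

namespace libgav1 {
namespace dsp {
void IntraPredInit_SSE4_1();  // declared in intrapred_sse4.h above
}  // namespace dsp
}  // namespace libgav1

// Hypothetical caller-side guard: serialize the non-thread-safe init.
void EnsureIntraPredInit() {
  static std::once_flag flag;
  std::call_once(flag, [] { libgav1::dsp::IntraPredInit_SSE4_1(); });
}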
diff --git a/libgav1/src/dsp/x86/inverse_transform_sse4.cc b/libgav1/src/dsp/x86/inverse_transform_sse4.cc
index 30ad436..12c008f 100644
--- a/libgav1/src/dsp/x86/inverse_transform_sse4.cc
+++ b/libgav1/src/dsp/x86/inverse_transform_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/inverse_transform.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -94,8 +94,7 @@
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
const __m128i ab = _mm_unpacklo_epi16(*b, *a);
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i x0 = _mm_madd_epi16(ba, msin_pcos);
@@ -121,8 +120,7 @@
const int16_t sin128 = Sin128(angle);
const __m128i psin_pcos = _mm_set1_epi32(
static_cast<uint16_t>(cos128) | (static_cast<uint32_t>(sin128) << 16));
- const __m128i sign =
- _mm_set_epi32(0x80000001, 0x80000001, 0x80000001, 0x80000001);
+ const __m128i sign = _mm_set1_epi32(static_cast<int>(0x80000001));
// -sin cos, -sin cos, -sin cos, -sin cos
const __m128i msin_pcos = _mm_sign_epi16(psin_pcos, sign);
const __m128i ba = _mm_unpacklo_epi16(*a, *b);
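For orientation: interleaving (b, a) and multiplying by the (-sin, cos) pairs with _mm_madd_epi16 produces a*cos - b*sin in one instruction, which is the standard butterfly rotation. A scalar sketch, assuming Cos128/Sin128 return cos/sin of angle*pi/128 scaled by 2^12 as in the C reference:

#include <cstdint>

inline int32_t RoundShift12(int32_t x) { return (x + (1 << 11)) >> 12; }

// Hypothetical scalar butterfly rotation in Q12 fixed point; |flip| swaps
// which rotated value lands in which output, as in the SIMD version.
inline void ButterflyRotationScalar(int16_t* a, int16_t* b, int16_t cos128,
                                    int16_t sin128, bool flip) {
  const int32_t x = *a * cos128 - *b * sin128;
  const int32_t y = *a * sin128 + *b * cos128;
  *a = static_cast<int16_t>(RoundShift12(flip ? y : x));
  *b = static_cast<int16_t>(RoundShift12(flip ? x : y));
}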
@@ -221,20 +219,16 @@
// Discrete Cosine Transforms (DCT).
template <int width>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
+ const __m128i v_src_lo = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
const __m128i v_src =
(width == 4) ? v_src_lo : _mm_shuffle_epi32(v_src_lo, 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
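The DcOnly paths short-circuit the whole transform once adjusted_tx_height reports that only the DC coefficient survived. A scalar model of DctDcOnly under stated assumptions (5793 taken as kTransformRowMultiplier and 2896 as cos(pi/4) in Q12; the helper is hypothetical):

#include <cstdint>

inline int32_t RoundShift(int32_t x, int s) {
  return (x + (1 << (s - 1))) >> s;
}

// With a single nonzero coefficient, the row transform collapses to an
// optional rounding multiply, one multiply by the DC basis value, a row
// shift, and a broadcast of the result across the row.
bool DctDcOnlyScalar(int16_t* dst, int adjusted_tx_height, bool should_round,
                     int row_shift, int width) {
  if (adjusted_tx_height > 1) return false;
  int32_t dc = dst[0];
  if (should_round) dc = RoundShift(dc * 5793, 12);  // kTransformRowMultiplier
  dc = RoundShift(dc * 2896, 12);  // Cos128(32): cos(pi/4) in Q12
  if (row_shift > 0) dc = RoundShift(dc, row_shift);
  for (int i = 0; i < width; ++i) dst[i] = static_cast<int16_t>(dc);
  return true;
}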
@@ -266,26 +260,22 @@
}
template <int height>
-LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool DctDcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
const int16_t cos128 = Cos128(32);
// Calculate dc values for first row.
if (width == 4) {
- const __m128i v_src = LoadLo8(src);
+ const __m128i v_src = LoadLo8(dst);
const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
StoreLo8(dst, xy);
} else {
int i = 0;
do {
- const __m128i v_src = LoadUnaligned16(&src[i]);
+ const __m128i v_src = LoadUnaligned16(&dst[i]);
const __m128i xy = _mm_mulhrs_epi16(v_src, _mm_set1_epi16(cos128 << 3));
StoreUnaligned16(&dst[i], xy);
i += 8;
@@ -294,21 +284,21 @@
// Copy first row to the rest of the block.
for (int y = 1; y < height; ++y) {
- memcpy(&dst[y * width], &src[(y - 1) * width], width * sizeof(dst[0]));
+ memcpy(&dst[y * width], dst, width * sizeof(dst[0]));
}
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct4Stages(__m128i* s) {
// stage 12.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[0], &s[1], 32, true);
ButterflyRotation_SecondIsZero(&s[2], &s[3], 48, false);
} else {
- bufferfly_rotation(&s[0], &s[1], 32, true);
- bufferfly_rotation(&s[2], &s[3], 48, false);
+ butterfly_rotation(&s[0], &s[1], 32, true);
+ butterfly_rotation(&s[2], &s[3], 48, false);
}
// stage 17.
@@ -317,23 +307,22 @@
}
// Process 4 dct4 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[4], x[4];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[8];
- LoadSrc<8, 8>(src, step, 0, input);
+ LoadSrc<8, 8>(dst, step, 0, input);
Transpose4x8To8x4_U16(input, x);
} else {
- LoadSrc<16, 4>(src, step, 0, x);
+ LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(src, step, 0, x);
+ LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
Transpose4x4_U16(x, x);
}
@@ -345,7 +334,7 @@
s[2] = x[1];
s[3] = x[3];
- Dct4Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
if (stage_is_rectangular) {
if (transpose) {
@@ -363,16 +352,16 @@
}
}
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct8Stages(__m128i* s) {
// stage 8.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[4], &s[7], 56, false);
ButterflyRotation_FirstIsZero(&s[5], &s[6], 24, false);
} else {
- bufferfly_rotation(&s[4], &s[7], 56, false);
- bufferfly_rotation(&s[5], &s[6], 24, false);
+ butterfly_rotation(&s[4], &s[7], 56, false);
+ butterfly_rotation(&s[5], &s[6], 24, false);
}
// stage 13.
@@ -380,7 +369,7 @@
HadamardRotation(&s[6], &s[7], true);
// stage 18.
- bufferfly_rotation(&s[6], &s[5], 32, true);
+ butterfly_rotation(&s[6], &s[5], 32, true);
// stage 22.
HadamardRotation(&s[0], &s[7], false);
@@ -390,28 +379,27 @@
}
// Process dct8 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[8], x[8];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8_U16(input, x);
} else {
- LoadSrc<8, 8>(src, step, 0, x);
+ LoadSrc<8, 8>(dst, step, 0, x);
}
} else {
if (transpose) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, 0, input);
+ LoadSrc<16, 8>(dst, step, 0, input);
Transpose8x8_U16(input, x);
} else {
- LoadSrc<16, 8>(src, step, 0, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
}
}
@@ -426,8 +414,8 @@
s[6] = x[3];
s[7] = x[7];
- Dct4Stages<bufferfly_rotation>(s);
- Dct8Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
if (stage_is_rectangular) {
if (transpose) {
@@ -448,20 +436,20 @@
}
}
-template <ButterflyRotationFunc bufferfly_rotation,
- bool is_fast_bufferfly = false>
+template <ButterflyRotationFunc butterfly_rotation,
+ bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct16Stages(__m128i* s) {
// stage 5.
- if (is_fast_bufferfly) {
+ if (is_fast_butterfly) {
ButterflyRotation_SecondIsZero(&s[8], &s[15], 60, false);
ButterflyRotation_FirstIsZero(&s[9], &s[14], 28, false);
ButterflyRotation_SecondIsZero(&s[10], &s[13], 44, false);
ButterflyRotation_FirstIsZero(&s[11], &s[12], 12, false);
} else {
- bufferfly_rotation(&s[8], &s[15], 60, false);
- bufferfly_rotation(&s[9], &s[14], 28, false);
- bufferfly_rotation(&s[10], &s[13], 44, false);
- bufferfly_rotation(&s[11], &s[12], 12, false);
+ butterfly_rotation(&s[8], &s[15], 60, false);
+ butterfly_rotation(&s[9], &s[14], 28, false);
+ butterfly_rotation(&s[10], &s[13], 44, false);
+ butterfly_rotation(&s[11], &s[12], 12, false);
}
// stage 9.
@@ -471,8 +459,8 @@
HadamardRotation(&s[14], &s[15], true);
// stage 14.
- bufferfly_rotation(&s[14], &s[9], 48, true);
- bufferfly_rotation(&s[13], &s[10], 112, true);
+ butterfly_rotation(&s[14], &s[9], 48, true);
+ butterfly_rotation(&s[13], &s[10], 112, true);
// stage 19.
HadamardRotation(&s[8], &s[11], false);
@@ -481,8 +469,8 @@
HadamardRotation(&s[13], &s[14], true);
// stage 23.
- bufferfly_rotation(&s[13], &s[10], 32, true);
- bufferfly_rotation(&s[12], &s[11], 32, true);
+ butterfly_rotation(&s[13], &s[10], 32, true);
+ butterfly_rotation(&s[12], &s[11], 32, true);
// stage 26.
HadamardRotation(&s[0], &s[15], false);
@@ -496,32 +484,31 @@
}
// Process dct16 rows or columns, depending on the transpose flag.
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Dct16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[16], x[16];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8_U16(input, x);
- LoadSrc<16, 4>(src, step, 8, input);
+ LoadSrc<16, 4>(dst, step, 8, input);
Transpose8x4To4x8_U16(input, &x[8]);
} else {
- LoadSrc<8, 16>(src, step, 0, x);
+ LoadSrc<8, 16>(dst, step, 0, x);
}
} else {
if (transpose) {
for (int idx = 0; idx < 16; idx += 8) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, idx, input);
+ LoadSrc<16, 8>(dst, step, idx, input);
Transpose8x8_U16(input, &x[idx]);
}
} else {
- LoadSrc<16, 16>(src, step, 0, x);
+ LoadSrc<16, 16>(dst, step, 0, x);
}
}
@@ -544,9 +531,9 @@
s[14] = x[7];
s[15] = x[15];
- Dct4Stages<bufferfly_rotation>(s);
- Dct8Stages<bufferfly_rotation>(s);
- Dct16Stages<bufferfly_rotation>(s);
+ Dct4Stages<butterfly_rotation>(s);
+ Dct8Stages<butterfly_rotation>(s);
+ Dct16Stages<butterfly_rotation>(s);
if (stage_is_rectangular) {
if (transpose) {
@@ -571,7 +558,7 @@
}
}
-template <ButterflyRotationFunc bufferfly_rotation,
+template <ButterflyRotationFunc butterfly_rotation,
bool is_fast_butterfly = false>
LIBGAV1_ALWAYS_INLINE void Dct32Stages(__m128i* s) {
// stage 3
@@ -585,14 +572,14 @@
ButterflyRotation_SecondIsZero(&s[22], &s[25], 38, false);
ButterflyRotation_FirstIsZero(&s[23], &s[24], 6, false);
} else {
- bufferfly_rotation(&s[16], &s[31], 62, false);
- bufferfly_rotation(&s[17], &s[30], 30, false);
- bufferfly_rotation(&s[18], &s[29], 46, false);
- bufferfly_rotation(&s[19], &s[28], 14, false);
- bufferfly_rotation(&s[20], &s[27], 54, false);
- bufferfly_rotation(&s[21], &s[26], 22, false);
- bufferfly_rotation(&s[22], &s[25], 38, false);
- bufferfly_rotation(&s[23], &s[24], 6, false);
+ butterfly_rotation(&s[16], &s[31], 62, false);
+ butterfly_rotation(&s[17], &s[30], 30, false);
+ butterfly_rotation(&s[18], &s[29], 46, false);
+ butterfly_rotation(&s[19], &s[28], 14, false);
+ butterfly_rotation(&s[20], &s[27], 54, false);
+ butterfly_rotation(&s[21], &s[26], 22, false);
+ butterfly_rotation(&s[22], &s[25], 38, false);
+ butterfly_rotation(&s[23], &s[24], 6, false);
}
// stage 6.
HadamardRotation(&s[16], &s[17], false);
@@ -605,10 +592,10 @@
HadamardRotation(&s[30], &s[31], true);
// stage 10.
- bufferfly_rotation(&s[30], &s[17], 24 + 32, true);
- bufferfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
- bufferfly_rotation(&s[26], &s[21], 24, true);
- bufferfly_rotation(&s[25], &s[22], 24 + 64, true);
+ butterfly_rotation(&s[30], &s[17], 24 + 32, true);
+ butterfly_rotation(&s[29], &s[18], 24 + 64 + 32, true);
+ butterfly_rotation(&s[26], &s[21], 24, true);
+ butterfly_rotation(&s[25], &s[22], 24 + 64, true);
// stage 15.
HadamardRotation(&s[16], &s[19], false);
@@ -621,10 +608,10 @@
HadamardRotation(&s[29], &s[30], true);
// stage 20.
- bufferfly_rotation(&s[29], &s[18], 48, true);
- bufferfly_rotation(&s[28], &s[19], 48, true);
- bufferfly_rotation(&s[27], &s[20], 48 + 64, true);
- bufferfly_rotation(&s[26], &s[21], 48 + 64, true);
+ butterfly_rotation(&s[29], &s[18], 48, true);
+ butterfly_rotation(&s[28], &s[19], 48, true);
+ butterfly_rotation(&s[27], &s[20], 48 + 64, true);
+ butterfly_rotation(&s[26], &s[21], 48 + 64, true);
// stage 24.
HadamardRotation(&s[16], &s[23], false);
@@ -637,10 +624,10 @@
HadamardRotation(&s[27], &s[28], true);
// stage 27.
- bufferfly_rotation(&s[27], &s[20], 32, true);
- bufferfly_rotation(&s[26], &s[21], 32, true);
- bufferfly_rotation(&s[25], &s[22], 32, true);
- bufferfly_rotation(&s[24], &s[23], 32, true);
+ butterfly_rotation(&s[27], &s[20], 32, true);
+ butterfly_rotation(&s[26], &s[21], 32, true);
+ butterfly_rotation(&s[25], &s[22], 32, true);
+ butterfly_rotation(&s[24], &s[23], 32, true);
// stage 29.
HadamardRotation(&s[0], &s[31], false);
@@ -662,21 +649,19 @@
}
// Process dct32 rows or columns, depending on the transpose flag.
-LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const void* source,
- const int32_t step,
+LIBGAV1_ALWAYS_INLINE void Dct32_SSE4_1(void* dest, const int32_t step,
const bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[32], x[32];
if (transpose) {
for (int idx = 0; idx < 32; idx += 8) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, idx, input);
+ LoadSrc<16, 8>(dst, step, idx, input);
Transpose8x8_U16(input, &x[idx]);
}
} else {
- LoadSrc<16, 32>(src, step, 0, x);
+ LoadSrc<16, 32>(dst, step, 0, x);
}
// stage 1
@@ -735,10 +720,8 @@
// Allow the compiler to call this function instead of force inlining. Tests
// show this is slightly faster.
-void Dct64_SSE4_1(void* dest, const void* source, int32_t step,
- bool transpose) {
+void Dct64_SSE4_1(void* dest, int32_t step, bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[64], x[32];
if (transpose) {
@@ -746,13 +729,13 @@
// 64.
for (int idx = 0; idx < 32; idx += 8) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, idx, input);
+ LoadSrc<16, 8>(dst, step, idx, input);
Transpose8x8_U16(input, &x[idx]);
}
} else {
// The last 32 values of every column are always zero if the |tx_height| is
// 64.
- LoadSrc<16, 32>(src, step, 0, x);
+ LoadSrc<16, 32>(dst, step, 0, x);
}
// stage 1
@@ -957,22 +940,21 @@
// Asymmetric Discrete Sine Transforms (ADST).
template <bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+LIBGAV1_ALWAYS_INLINE void Adst4_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[8], x[4];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[8];
- LoadSrc<8, 8>(src, step, 0, input);
+ LoadSrc<8, 8>(dst, step, 0, input);
Transpose4x8To8x4_U16(input, x);
} else {
- LoadSrc<16, 4>(src, step, 0, x);
+ LoadSrc<16, 4>(dst, step, 0, x);
}
} else {
- LoadSrc<8, 4>(src, step, 0, x);
+ LoadSrc<8, 4>(dst, step, 0, x);
if (transpose) {
Transpose4x4_U16(x, x);
}
@@ -1049,18 +1031,15 @@
constexpr int16_t kAdst4DcOnlyMultiplier[8] = {1321, 0, 2482, 0,
3344, 0, 2482, 1321};
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
const __m128i v_src =
- _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ _mm_shuffle_epi32(_mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
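The interleaved constants in kAdst4DcOnlyMultiplier come from the adst4 basis: for an input of [dc, 0, 0, 0] the four outputs are dc times sin(k*pi/9) in Q12 for k = 1..4, and the trailing {2482, 1321} pair exploits the identity sin(4*pi/9) = sin(pi/9) + sin(2*pi/9) in this scaled form (1321 + 2482 = 3803). A scalar sketch under those assumptions:

#include <cstdint>

// Hypothetical scalar core of Adst4DcOnly: multiply the DC value by the
// first adst4 basis column, with sinpi(4) assembled from sinpi(1) + sinpi(2).
inline void Adst4DcOnlyScalar(int32_t dc, int16_t out[4]) {
  static constexpr int32_t kSinPi[4] = {1321, 2482, 3344, 1321 + 2482};
  for (int i = 0; i < 4; ++i) {
    out[i] = static_cast<int16_t>((dc * kSinPi[i] + (1 << 11)) >> 12);
  }
}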
@@ -1083,19 +1062,14 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst4DcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
int i = 0;
do {
- const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&src[i]));
+ const __m128i v_src = _mm_cvtepi16_epi32(LoadLo8(&dst[i]));
const __m128i kAdst4Multiplier_0 = _mm_set1_epi32(kAdst4Multiplier[0]);
const __m128i kAdst4Multiplier_1 = _mm_set1_epi32(kAdst4Multiplier[1]);
const __m128i kAdst4Multiplier_2 = _mm_set1_epi32(kAdst4Multiplier[2]);
@@ -1122,28 +1096,27 @@
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst8_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[8], x[8];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8_U16(input, x);
} else {
- LoadSrc<8, 8>(src, step, 0, x);
+ LoadSrc<8, 8>(dst, step, 0, x);
}
} else {
if (transpose) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, 0, input);
+ LoadSrc<16, 8>(dst, step, 0, input);
Transpose8x8_U16(input, x);
} else {
- LoadSrc<16, 8>(src, step, 0, x);
+ LoadSrc<16, 8>(dst, step, 0, x);
}
}
@@ -1158,10 +1131,10 @@
s[7] = x[6];
// stage 2.
- bufferfly_rotation(&s[0], &s[1], 60 - 0, true);
- bufferfly_rotation(&s[2], &s[3], 60 - 16, true);
- bufferfly_rotation(&s[4], &s[5], 60 - 32, true);
- bufferfly_rotation(&s[6], &s[7], 60 - 48, true);
+ butterfly_rotation(&s[0], &s[1], 60 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 60 - 16, true);
+ butterfly_rotation(&s[4], &s[5], 60 - 32, true);
+ butterfly_rotation(&s[6], &s[7], 60 - 48, true);
// stage 3.
HadamardRotation(&s[0], &s[4], false);
@@ -1170,8 +1143,8 @@
HadamardRotation(&s[3], &s[7], false);
// stage 4.
- bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
- bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
// stage 5.
HadamardRotation(&s[0], &s[2], false);
@@ -1180,8 +1153,8 @@
HadamardRotation(&s[5], &s[7], false);
// stage 6.
- bufferfly_rotation(&s[2], &s[3], 32, true);
- bufferfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
// stage 7.
const __m128i v_zero = _mm_setzero_si128();
@@ -1213,19 +1186,16 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[8];
- const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1283,20 +1253,16 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst8DcOnlyColumn(void* dest, int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[8];
int i = 0;
do {
- const __m128i v_src = LoadLo8(&src[i]);
+ const __m128i v_src = LoadLo8(dst);
// stage 1.
s[1] = v_src;
@@ -1342,32 +1308,31 @@
return true;
}
-template <ButterflyRotationFunc bufferfly_rotation, bool stage_is_rectangular>
-LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, const void* source,
- int32_t step, bool transpose) {
+template <ButterflyRotationFunc butterfly_rotation, bool stage_is_rectangular>
+LIBGAV1_ALWAYS_INLINE void Adst16_SSE4_1(void* dest, int32_t step,
+ bool transpose) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[16], x[16];
if (stage_is_rectangular) {
if (transpose) {
__m128i input[4];
- LoadSrc<16, 4>(src, step, 0, input);
+ LoadSrc<16, 4>(dst, step, 0, input);
Transpose8x4To4x8_U16(input, x);
- LoadSrc<16, 4>(src, step, 8, input);
+ LoadSrc<16, 4>(dst, step, 8, input);
Transpose8x4To4x8_U16(input, &x[8]);
} else {
- LoadSrc<8, 16>(src, step, 0, x);
+ LoadSrc<8, 16>(dst, step, 0, x);
}
} else {
if (transpose) {
for (int idx = 0; idx < 16; idx += 8) {
__m128i input[8];
- LoadSrc<16, 8>(src, step, idx, input);
+ LoadSrc<16, 8>(dst, step, idx, input);
Transpose8x8_U16(input, &x[idx]);
}
} else {
- LoadSrc<16, 16>(src, step, 0, x);
+ LoadSrc<16, 16>(dst, step, 0, x);
}
}
@@ -1390,14 +1355,14 @@
s[15] = x[14];
// stage 2.
- bufferfly_rotation(&s[0], &s[1], 62 - 0, true);
- bufferfly_rotation(&s[2], &s[3], 62 - 8, true);
- bufferfly_rotation(&s[4], &s[5], 62 - 16, true);
- bufferfly_rotation(&s[6], &s[7], 62 - 24, true);
- bufferfly_rotation(&s[8], &s[9], 62 - 32, true);
- bufferfly_rotation(&s[10], &s[11], 62 - 40, true);
- bufferfly_rotation(&s[12], &s[13], 62 - 48, true);
- bufferfly_rotation(&s[14], &s[15], 62 - 56, true);
+ butterfly_rotation(&s[0], &s[1], 62 - 0, true);
+ butterfly_rotation(&s[2], &s[3], 62 - 8, true);
+ butterfly_rotation(&s[4], &s[5], 62 - 16, true);
+ butterfly_rotation(&s[6], &s[7], 62 - 24, true);
+ butterfly_rotation(&s[8], &s[9], 62 - 32, true);
+ butterfly_rotation(&s[10], &s[11], 62 - 40, true);
+ butterfly_rotation(&s[12], &s[13], 62 - 48, true);
+ butterfly_rotation(&s[14], &s[15], 62 - 56, true);
// stage 3.
HadamardRotation(&s[0], &s[8], false);
@@ -1410,10 +1375,10 @@
HadamardRotation(&s[7], &s[15], false);
// stage 4.
- bufferfly_rotation(&s[8], &s[9], 56 - 0, true);
- bufferfly_rotation(&s[13], &s[12], 8 + 0, true);
- bufferfly_rotation(&s[10], &s[11], 56 - 32, true);
- bufferfly_rotation(&s[15], &s[14], 8 + 32, true);
+ butterfly_rotation(&s[8], &s[9], 56 - 0, true);
+ butterfly_rotation(&s[13], &s[12], 8 + 0, true);
+ butterfly_rotation(&s[10], &s[11], 56 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 8 + 32, true);
// stage 5.
HadamardRotation(&s[0], &s[4], false);
@@ -1426,10 +1391,10 @@
HadamardRotation(&s[11], &s[15], false);
// stage 6.
- bufferfly_rotation(&s[4], &s[5], 48 - 0, true);
- bufferfly_rotation(&s[12], &s[13], 48 - 0, true);
- bufferfly_rotation(&s[7], &s[6], 48 - 32, true);
- bufferfly_rotation(&s[15], &s[14], 48 - 32, true);
+ butterfly_rotation(&s[4], &s[5], 48 - 0, true);
+ butterfly_rotation(&s[12], &s[13], 48 - 0, true);
+ butterfly_rotation(&s[7], &s[6], 48 - 32, true);
+ butterfly_rotation(&s[15], &s[14], 48 - 32, true);
// stage 7.
HadamardRotation(&s[0], &s[2], false);
@@ -1442,10 +1407,10 @@
HadamardRotation(&s[13], &s[15], false);
// stage 8.
- bufferfly_rotation(&s[2], &s[3], 32, true);
- bufferfly_rotation(&s[6], &s[7], 32, true);
- bufferfly_rotation(&s[10], &s[11], 32, true);
- bufferfly_rotation(&s[14], &s[15], 32, true);
+ butterfly_rotation(&s[2], &s[3], 32, true);
+ butterfly_rotation(&s[6], &s[7], 32, true);
+ butterfly_rotation(&s[10], &s[11], 32, true);
+ butterfly_rotation(&s[14], &s[15], 32, true);
// stage 9.
const __m128i v_zero = _mm_setzero_si128();
@@ -1546,20 +1511,17 @@
x[15] = _mm_subs_epi16(v_zero, s[1]);
}
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
__m128i s[16];
__m128i x[16];
- const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(src[0]), 0);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_src = _mm_shufflelo_epi16(_mm_cvtsi32_si128(dst[0]), 0);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1589,21 +1551,17 @@
return true;
}
-LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Adst16DcOnlyColumn(void* dest,
+ int adjusted_tx_height,
int width) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
int i = 0;
do {
__m128i s[16];
__m128i x[16];
- const __m128i v_src = LoadUnaligned16(&src[i]);
+ const __m128i v_src = LoadUnaligned16(dst);
// stage 1.
s[1] = v_src;
@@ -1623,10 +1581,8 @@
// Identity Transforms.
template <bool is_row_shift>
-LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity4_SSE4_1(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
if (is_row_shift) {
const int shift = 1;
@@ -1634,7 +1590,7 @@
const __m128i v_multiplier_one =
_mm_set1_epi32((kIdentity4Multiplier << 16) | 0x0001);
for (int i = 0; i < 4; i += 2) {
- const __m128i v_src = LoadUnaligned16(&src[i * step]);
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
const __m128i v_src_round = _mm_unpacklo_epi16(v_dual_round, v_src);
const __m128i v_src_round_hi = _mm_unpackhi_epi16(v_dual_round, v_src);
const __m128i a = _mm_madd_epi16(v_src_round, v_multiplier_one);
@@ -1647,7 +1603,7 @@
const __m128i v_multiplier =
_mm_set1_epi16(kIdentity4MultiplierFraction << 3);
for (int i = 0; i < 4; i += 2) {
- const __m128i v_src = LoadUnaligned16(&src[i * step]);
+ const __m128i v_src = LoadUnaligned16(&dst[i * step]);
const __m128i a = _mm_mulhrs_epi16(v_src, v_multiplier);
const __m128i b = _mm_adds_epi16(a, v_src);
StoreUnaligned16(&dst[i * step], b);
@@ -1655,18 +1611,14 @@
}
}
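
Note: the recurring change in these hunks is the in-place rewrite: every call site passed the same buffer as both dest and source, so the source parameter is dropped and rows are loaded from and stored back to dst. Reduced shape of the pattern, using the same intrinsics as above:

    #include <smmintrin.h>
    #include <cstdint>

    // Each row is read, transformed, and overwritten in place; the store
    // of iteration h completes before iteration h+1 loads, so there is no
    // aliasing hazard.
    inline void IdentityRowInPlaceSketch(int16_t* dst, int32_t step,
                                         __m128i v_multiplier) {
      for (int h = 0; h < 4; ++h) {
        __m128i* const p = reinterpret_cast<__m128i*>(&dst[h * step]);
        const __m128i v_src = _mm_loadu_si128(p);
        _mm_storeu_si128(p, _mm_mulhrs_epi16(v_src, v_multiplier));
      }
    }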
-LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity4DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int tx_height) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
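
Note: _mm_set1_epi16 takes a 16-bit value, and the literal 0xffff is an int (65535) that does not fit in int16_t, so the old "should_round ? 0xffff : 0" drew implicit-narrowing warnings; static_cast<int16_t>(0xffff) is -1, the same all-ones bit pattern. That all-ones/all-zeros mask then selects between the rounded and unrounded value. A hedged sketch of the select idiom (the actual blend falls outside this hunk's context):

    #include <smmintrin.h>

    // _mm_blendv_epi8 picks bytes from the second operand where the mask's
    // high bit is set, so an all-ones mask selects the rounded lanes and an
    // all-zeros mask keeps the originals.
    inline __m128i SelectRoundedSketch(__m128i v_src, __m128i v_src_round,
                                       bool should_round) {
      const __m128i v_mask =
          _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
      return _mm_blendv_epi8(v_src, v_src_round, v_mask);
    }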
@@ -1786,29 +1738,25 @@
}
}
-LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row32_SSE4_1(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
// When combining the identity8 multiplier with the row shift, the
// calculations for tx_height equal to 32 can be simplified from
// ((A * 2) + 2) >> 2) to ((A + 1) >> 1).
const __m128i v_row_multiplier = _mm_set1_epi16(1 << 14);
for (int h = 0; h < 4; ++h) {
- const __m128i v_src = LoadUnaligned16(&src[h * step]);
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
const __m128i v_src_mult = _mm_mulhrs_epi16(v_src, v_row_multiplier);
StoreUnaligned16(&dst[h * step], v_src_mult);
}
}
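
Note: the 1 << 14 multiplier works because _mm_mulhrs_epi16 computes (a * b + (1 << 14)) >> 15 per 16-bit lane; with b = 1 << 14 that is (a + 1) >> 1, exactly the simplification named in the comment above. A compile-time check of the arithmetic:

    // ((A * 2) + 2) >> 2 == (2A + 2) >> 2 == (A + 1) >> 1, which is what
    // mulhrs produces for b = 1 << 14: (A * 2^14 + 2^14) >> 15.
    constexpr int Original(int a) { return ((a * 2) + 2) >> 2; }
    constexpr int Simplified(int a) { return (a + 1) >> 1; }
    constexpr int MulhrsLane(int a) { return (a * (1 << 14) + (1 << 14)) >> 15; }
    static_assert(Original(37) == Simplified(37), "identity8 row, height 32");
    static_assert(MulhrsLane(37) == Simplified(37), "matches _mm_mulhrs_epi16");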
-LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, const void* source,
- int32_t step) {
+LIBGAV1_ALWAYS_INLINE void Identity8Row4_SSE4_1(void* dest, int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
for (int h = 0; h < 4; ++h) {
- const __m128i v_src = LoadUnaligned16(&src[h * step]);
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
// For bitdepth == 8, the identity row clamps to a signed 16bit value, so
// saturating add here is ok.
const __m128i a = _mm_adds_epi16(v_src, v_src);
@@ -1816,18 +1764,14 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity8DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int row_shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round =
@@ -1884,10 +1828,9 @@
}
}
-LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, const void* source,
- int32_t step, int shift) {
+LIBGAV1_ALWAYS_INLINE void Identity16Row_SSE4_1(void* dest, int32_t step,
+ int shift) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
const __m128i v_dual_round = _mm_set1_epi16((1 + (shift << 1)) << 11);
const __m128i v_multiplier_one =
@@ -1895,8 +1838,8 @@
const __m128i v_shift = _mm_set_epi64x(0, 12 + shift);
for (int h = 0; h < 4; ++h) {
- const __m128i v_src = LoadUnaligned16(&src[h * step]);
- const __m128i v_src2 = LoadUnaligned16(&src[h * step + 8]);
+ const __m128i v_src = LoadUnaligned16(&dst[h * step]);
+ const __m128i v_src2 = LoadUnaligned16(&dst[h * step + 8]);
const __m128i v_src_round0 = _mm_unpacklo_epi16(v_dual_round, v_src);
const __m128i v_src_round1 = _mm_unpackhi_epi16(v_dual_round, v_src);
const __m128i v_src2_round0 = _mm_unpacklo_epi16(v_dual_round, v_src2);
@@ -1914,18 +1857,14 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, const void* source,
- int non_zero_coeff_count,
+LIBGAV1_ALWAYS_INLINE bool Identity16DcOnly(void* dest, int adjusted_tx_height,
bool should_round, int shift) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
- const __m128i v_mask = _mm_set1_epi16(should_round ? 0xffff : 0);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
+ const __m128i v_mask =
+ _mm_set1_epi16(should_round ? static_cast<int16_t>(0xffff) : 0);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src_round0 =
@@ -1990,17 +1929,15 @@
}
LIBGAV1_ALWAYS_INLINE void Identity32Row16_SSE4_1(void* dest,
- const void* source,
const int32_t step) {
auto* const dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
// When combining the identity32 multiplier with the row shift, the
// calculation for tx_height equal to 16 can be simplified from
// ((A * 4) + 1) >> 1) to (A * 2).
for (int h = 0; h < 4; ++h) {
for (int i = 0; i < 32; i += 8) {
- const __m128i v_src = LoadUnaligned16(&src[h * step + i]);
+ const __m128i v_src = LoadUnaligned16(&dst[h * step + i]);
// For bitdepth == 8, the identity row clamps to a signed 16bit value, so
// saturating add here is ok.
const __m128i v_dst_i = _mm_adds_epi16(v_src, v_src);
@@ -2009,16 +1946,12 @@
}
}
-LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest, const void* source,
- int non_zero_coeff_count) {
- if (non_zero_coeff_count > 1) {
- return false;
- }
+LIBGAV1_ALWAYS_INLINE bool Identity32DcOnly(void* dest,
+ int adjusted_tx_height) {
+ if (adjusted_tx_height > 1) return false;
auto* dst = static_cast<int16_t*>(dest);
- const auto* const src = static_cast<const int16_t*>(source);
-
- const __m128i v_src0 = _mm_cvtsi32_si128(src[0]);
+ const __m128i v_src0 = _mm_cvtsi32_si128(dst[0]);
const __m128i v_kTransformRowMultiplier =
_mm_set1_epi16(kTransformRowMultiplier << 3);
const __m128i v_src = _mm_mulhrs_epi16(v_src0, v_kTransformRowMultiplier);
@@ -2063,11 +1996,11 @@
LIBGAV1_ALWAYS_INLINE void Wht4_SSE4_1(Array2DView<uint8_t> frame,
const int start_x, const int start_y,
const void* source,
- const int non_zero_coeff_count) {
+ const int adjusted_tx_height) {
const auto* const src = static_cast<const int16_t*>(source);
__m128i s[4], x[4];
- if (non_zero_coeff_count == 1) {
+ if (adjusted_tx_height == 1) {
// Special case: only src[0] is nonzero.
// src[0] 0 0 0
// 0 0 0 0
@@ -2292,479 +2225,459 @@
}
}
-void Dct4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ const int row_shift = static_cast<int>(tx_height == 16);
- if (is_row) {
- const bool should_round = (tx_height == 8);
- const int row_shift = static_cast<int>(tx_height == 16);
-
- if (DctDcOnly<4>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct4 rows in parallel.
- Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], /*step=*/4,
- /*transpose=*/true);
- } else {
- // Process 8 1d dct4 rows in parallel per iteration.
- int i = 0;
- do {
- Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], &src[i * 4],
- /*step=*/4, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (tx_height == 16) {
- RowShift<4>(src, num_rows, 1);
- }
+ if (DctDcOnly<4>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct4 rows in parallel.
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, /*step=*/4,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d dct4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i * 4], /*step=*/4,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (tx_height == 16) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Dct4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<4>(src, tx_width);
}
- if (!DctDcOnlyColumn<4>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<4>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct4 columns in parallel.
- Dct4_SSE4_1<ButterflyRotation_4, false>(&src[0], &src[0], tx_width,
+ Dct4_SSE4_1<ButterflyRotation_4, false>(src, tx_width,
/*transpose=*/false);
} else {
// Process 8 1d dct4 columns in parallel per iteration.
int i = 0;
do {
- Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], &src[i], tx_width,
+ Dct4_SSE4_1<ButterflyRotation_8, true>(&src[i], tx_width,
/*transpose=*/false);
i += 8;
} while (i < tx_width);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound(frame, start_x, start_y, tx_width, 4, src, tx_type);
}
-void Dct8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<8>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct8 rows in parallel.
- Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
- /*transpose=*/true);
- } else {
- // Process 8 1d dct8 rows in parallel per iteration.
- int i = 0;
- do {
- Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
- /*step=*/8, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (row_shift > 0) {
- RowShift<8>(src, num_rows, row_shift);
- }
+ if (DctDcOnly<8>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct8 rows in parallel.
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8, /*transpose=*/true);
+ } else {
+ // Process 8 1d dct8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Dct8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- if (!DctDcOnlyColumn<8>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<8>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct8 columns in parallel.
- Dct8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Dct8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
// Process 8 1d dct8 columns in parallel per iteration.
int i = 0;
do {
- Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Dct8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
/*transpose=*/false);
i += 8;
} while (i < tx_width);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound(frame, start_x, start_y, tx_width, 8, src, tx_type);
}
-void Dct16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<16>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d dct16 rows in parallel.
- Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
- /*transpose=*/true);
- } else {
- int i = 0;
- do {
- // Process 8 1d dct16 rows in parallel per iteration.
- Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16], 16,
- /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- // row_shift is always non zero here.
- RowShift<16>(src, num_rows, row_shift);
-
+ if (DctDcOnly<16>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d dct16 rows in parallel.
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d dct16 rows in parallel per iteration.
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- if (!DctDcOnlyColumn<16>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!DctDcOnlyColumn<16>(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d dct16 columns in parallel.
- Dct16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Dct16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
int i = 0;
do {
// Process 8 1d dct16 columns in parallel per iteration.
- Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Dct16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
/*transpose=*/false);
i += 8;
} while (i < tx_width);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound(frame, start_x, start_y, tx_width, 16, src, tx_type);
}
-void Dct32TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<32>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<32>(src, num_rows);
- }
- // Process 8 1d dct32 rows in parallel per iteration.
- int i = 0;
- do {
- Dct32_SSE4_1(&src[i * 32], &src[i * 32], 32, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- // row_shift is always non zero here.
- RowShift<32>(src, num_rows, row_shift);
-
+ if (DctDcOnly<32>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
- if (!DctDcOnlyColumn<32>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (should_round) {
+ ApplyRounding<32>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct32 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct32_SSE4_1(&src[i * 32], 32, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non zero here.
+ RowShift<32>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct32TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<32>(src, adjusted_tx_height, tx_width)) {
// Process 8 1d dct32 columns in parallel per iteration.
int i = 0;
do {
- Dct32_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
+ Dct32_SSE4_1(&src[i], tx_width, /*transpose=*/false);
i += 8;
} while (i < tx_width);
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound(frame, start_x, start_y, tx_width, 32, src, tx_type);
}
-void Dct64TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Dct64TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (DctDcOnly<64>(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<64>(src, num_rows);
- }
- // Process 8 1d dct64 rows in parallel per iteration.
- int i = 0;
- do {
- Dct64_SSE4_1(&src[i * 64], &src[i * 64], 64, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- // row_shift is always non zero here.
- RowShift<64>(src, num_rows, row_shift);
-
+ if (DctDcOnly<64>(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
- if (!DctDcOnlyColumn<64>(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (should_round) {
+ ApplyRounding<64>(src, adjusted_tx_height);
+ }
+ // Process 8 1d dct64 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Dct64_SSE4_1(&src[i * 64], 64, /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ // row_shift is always non zero here.
+ RowShift<64>(src, adjusted_tx_height, row_shift);
+}
+
+void Dct64TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ if (!DctDcOnlyColumn<64>(src, adjusted_tx_height, tx_width)) {
// Process 8 1d dct64 columns in parallel per iteration.
int i = 0;
do {
- Dct64_SSE4_1(&src[i], &src[i], tx_width, /*transpose=*/false);
+ Dct64_SSE4_1(&src[i], tx_width, /*transpose=*/false);
i += 8;
} while (i < tx_width);
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound(frame, start_x, start_y, tx_width, 64, src, tx_type);
}
-void Adst4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst4TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
const int tx_height = kTransformHeight[tx_size];
+ const int row_shift = static_cast<int>(tx_height == 16);
+ const bool should_round = (tx_height == 8);
- if (is_row) {
- const uint8_t row_shift = static_cast<uint8_t>(tx_height == 16);
- const bool should_round = (tx_height == 8);
-
- if (Adst4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
-
- // Process 4 1d adst4 rows in parallel per iteration.
- int i = 0;
- do {
- Adst4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4,
- /*transpose=*/true);
- i += 4;
- } while (i < num_rows);
-
- if (row_shift != 0u) {
- RowShift<4>(src, num_rows, 1);
- }
+ if (Adst4DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+
+ // Process 4 1d adst4 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst4_SSE4_1<false>(&src[i * 4], /*step=*/4, /*transpose=*/true);
+ i += 4;
+ } while (i < adjusted_tx_height);
+
+ if (row_shift != 0) {
+ RowShift<4>(src, adjusted_tx_height, 1);
+ }
+}
+
+void Adst4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<4>(src, tx_width);
}
- if (!Adst4DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst4DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
// Process 4 1d adst4 columns in parallel per iteration.
int i = 0;
do {
- Adst4_SSE4_1<false>(&src[i], &src[i], tx_width, /*transpose=*/false);
+ Adst4_SSE4_1<false>(&src[i], tx_width, /*transpose=*/false);
i += 4;
} while (i < tx_width);
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
tx_width, 4, src, tx_type);
}
-void Adst8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst8TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size, int adjusted_tx_height,
+ void* src_buffer, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Adst8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d adst8 rows in parallel.
- Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], /*step=*/8,
- /*transpose=*/true);
- } else {
- // Process 8 1d adst8 rows in parallel per iteration.
- int i = 0;
- do {
- Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], &src[i * 8],
- /*step=*/8,
- /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- if (row_shift > 0) {
- RowShift<8>(src, num_rows, row_shift);
- }
+ if (Adst8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst8 rows in parallel.
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, /*step=*/8,
+ /*transpose=*/true);
+ } else {
+ // Process 8 1d adst8 rows in parallel per iteration.
+ int i = 0;
+ do {
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i * 8], /*step=*/8,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ if (row_shift > 0) {
+ RowShift<8>(src, adjusted_tx_height, row_shift);
+ }
+}
+
+void Adst8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- if (!Adst8DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst8DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d adst8 columns in parallel.
- Adst8_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Adst8_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
// Process 8 1d adst8 columns in parallel per iteration.
int i = 0;
do {
- Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Adst8_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
/*transpose=*/false);
i += 8;
} while (i < tx_width);
}
}
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
StoreToFrameWithRound</*enable_flip_rows=*/true>(frame, start_x, start_y,
tx_width, 8, src, tx_type);
}
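
Note: the enable_flip_rows template flag on StoreToFrameWithRound exists because the FLIPADST variants write the transformed block bottom-up, which only the ADST column passes can need. A scalar stand-in for the row-index flip; rounding and the real frame type are simplified away, and the helper name is illustrative:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    template <bool enable_flip_rows>
    void StoreBlockSketch(uint8_t* frame, ptrdiff_t stride,
                          const int16_t* residual, int tx_width,
                          int tx_height, bool flip_rows) {
      for (int r = 0; r < tx_height; ++r) {
        // Bottom-up destination row when flipping is enabled and requested.
        const int fr = (enable_flip_rows && flip_rows) ? tx_height - 1 - r : r;
        for (int c = 0; c < tx_width; ++c) {
          const int px = frame[fr * stride + c] + residual[r * tx_width + c];
          frame[fr * stride + c] =
              static_cast<uint8_t>(std::clamp(px, 0, 255));
        }
      }
    }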
-void Adst16TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Adst16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
-
- if (Adst16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
-
- if (num_rows <= 4) {
- // Process 4 1d adst16 rows in parallel.
- Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 16,
- /*transpose=*/true);
- } else {
- int i = 0;
- do {
- // Process 8 1d adst16 rows in parallel per iteration.
- Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], &src[i * 16],
- 16, /*transpose=*/true);
- i += 8;
- } while (i < num_rows);
- }
- // row_shift is always non zero here.
- RowShift<16>(src, num_rows, row_shift);
-
+ if (Adst16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+
+ if (adjusted_tx_height <= 4) {
+ // Process 4 1d adst16 rows in parallel.
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 16, /*transpose=*/true);
+ } else {
+ int i = 0;
+ do {
+ // Process 8 1d adst16 rows in parallel per iteration.
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i * 16], 16,
+ /*transpose=*/true);
+ i += 8;
+ } while (i < adjusted_tx_height);
+ }
+ // row_shift is always non zero here.
+ RowShift<16>(src, adjusted_tx_height, row_shift);
+}
+
+void Adst16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y,
+ void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- if (!Adst16DcOnlyColumn(&src[0], &src[0], non_zero_coeff_count, tx_width)) {
+ if (!Adst16DcOnlyColumn(src, adjusted_tx_height, tx_width)) {
if (tx_width == 4) {
// Process 4 1d adst16 columns in parallel.
- Adst16_SSE4_1<ButterflyRotation_4, true>(&src[0], &src[0], 4,
- /*transpose=*/false);
+ Adst16_SSE4_1<ButterflyRotation_4, true>(src, 4, /*transpose=*/false);
} else {
int i = 0;
do {
// Process 8 1d adst16 columns in parallel per iteration.
- Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], &src[i], tx_width,
+ Adst16_SSE4_1<ButterflyRotation_8, false>(&src[i], tx_width,
/*transpose=*/false);
i += 8;
} while (i < tx_width);
@@ -2774,56 +2687,57 @@
tx_width, 16, src, tx_type);
}
-void Identity4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
+void Identity4TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize4x4) {
+ return;
+ }
+
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = (tx_height == 8);
+ if (Identity4DcOnly(src, adjusted_tx_height, should_round, tx_height)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<4>(src, adjusted_tx_height);
+ }
+ if (tx_height < 16) {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<false>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ } else {
+ int i = 0;
+ do {
+ Identity4_SSE4_1<true>(&src[i * 4], /*step=*/4);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ }
+}
+
+void Identity4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
auto* src = static_cast<int16_t*>(src_buffer);
const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
- if (is_row) {
- // Special case: Process row calculations during column transform call.
- // Improves performance.
- if (tx_type == kTransformTypeIdentityIdentity &&
- tx_size == kTransformSize4x4) {
- return;
- }
-
- const bool should_round = (tx_height == 8);
- if (Identity4DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- tx_height)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<4>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<4>(src, num_rows);
- }
- if (tx_height < 16) {
- int i = 0;
- do {
- Identity4_SSE4_1<false>(&src[i * 4], &src[i * 4], /*step=*/4);
- i += 4;
- } while (i < num_rows);
- } else {
- int i = 0;
- do {
- Identity4_SSE4_1<true>(&src[i * 4], &src[i * 4], /*step=*/4);
- i += 4;
- } while (i < num_rows);
- }
- return;
- }
- assert(!is_row);
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
// Special case: Process row calculations during column transform call.
if (tx_type == kTransformTypeIdentityIdentity &&
(tx_size == kTransformSize4x4 || tx_size == kTransformSize8x4)) {
- Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width, height,
- src);
+ Identity4RowColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
return;
}
@@ -2831,274 +2745,272 @@
FlipColumns<4>(src, tx_width);
}
- Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
+ Identity4ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
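
Note: the deleted "height = (non_zero_coeff_count == 1) ? 1 : tx_height" and GetNumRows calls are not lost; the caller now performs the clamping once and hands every Row/Column function the result as adjusted_tx_height. One plausible reduction, for illustration only (the real caller also consults tx_type row masks, elided here):

    #include <algorithm>

    // Hypothetical caller-side stand-in for the deleted per-function clamps.
    inline int AdjustedTxHeightSketch(int non_zero_coeff_count, int tx_height) {
      if (non_zero_coeff_count == 1) return 1;  // DC-only block
      return std::min(tx_height, 32);           // 1-D transforms cap at 32 rows
    }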
-void Identity8TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
- auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
-
- if (is_row) {
- // Special case: Process row calculations during column transform call.
- // Improves performance.
- if (tx_type == kTransformTypeIdentityIdentity &&
- tx_size == kTransformSize8x4) {
- return;
- }
-
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
- if (Identity8DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<8>(tx_type, tx_height, non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<8>(src, num_rows);
- }
-
- // When combining the identity8 multiplier with the row shift, the
- // calculations for tx_height == 8 and tx_height == 16 can be simplified
- // from ((A * 2) + 1) >> 1) to A.
- if ((tx_height & 0x18) != 0) {
- return;
- }
- if (tx_height == 32) {
- int i = 0;
- do {
- Identity8Row32_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
- i += 4;
- } while (i < num_rows);
- return;
- }
-
- // Process kTransformSize8x4
- assert(tx_size == kTransformSize8x4);
- int i = 0;
- do {
- Identity8Row4_SSE4_1(&src[i * 8], &src[i * 8], /*step=*/8);
- i += 4;
- } while (i < num_rows);
+void Identity8TransformLoopRow_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
+ // Special case: Process row calculations during column transform call.
+ // Improves performance.
+ if (tx_type == kTransformTypeIdentityIdentity &&
+ tx_size == kTransformSize8x4) {
return;
}
- assert(!is_row);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_height = kTransformHeight[tx_size];
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity8DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
+ return;
+ }
+
+ if (should_round) {
+ ApplyRounding<8>(src, adjusted_tx_height);
+ }
+
+ // When combining the identity8 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 16 can be simplified
+ // from ((A * 2) + 1) >> 1) to A.
+ if ((tx_height & 0x18) != 0) {
+ return;
+ }
+ if (tx_height == 32) {
+ int i = 0;
+ do {
+ Identity8Row32_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+ return;
+ }
+
+ assert(tx_size == kTransformSize8x4);
+ int i = 0;
+ do {
+ Identity8Row4_SSE4_1(&src[i * 8], /*step=*/8);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity8TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<8>(src, tx_width);
}
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
- src);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity8ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
-void Identity16TransformLoop_SSE4_1(TransformType tx_type,
- TransformSize tx_size, void* src_buffer,
- int start_x, int start_y, void* dst_frame,
- bool is_row, int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+void Identity16TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
- const int tx_height = kTransformHeight[tx_size];
-
- if (is_row) {
- const bool should_round = kShouldRound[tx_size];
- const uint8_t row_shift = kTransformRowShift[tx_size];
- if (Identity16DcOnly(&src[0], &src[0], non_zero_coeff_count, should_round,
- row_shift)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<16>(tx_type, std::min(tx_height, 32), non_zero_coeff_count);
- if (should_round) {
- ApplyRounding<16>(src, num_rows);
- }
- int i = 0;
- do {
- Identity16Row_SSE4_1(&src[i * 16], &src[i * 16], /*step=*/16,
- kTransformRowShift[tx_size]);
- i += 4;
- } while (i < num_rows);
+ const bool should_round = kShouldRound[tx_size];
+ const uint8_t row_shift = kTransformRowShift[tx_size];
+ if (Identity16DcOnly(src, adjusted_tx_height, should_round, row_shift)) {
return;
}
- assert(!is_row);
+ if (should_round) {
+ ApplyRounding<16>(src, adjusted_tx_height);
+ }
+ int i = 0;
+ do {
+ Identity16Row_SSE4_1(&src[i * 16], /*step=*/16,
+ kTransformRowShift[tx_size]);
+ i += 4;
+ } while (i < adjusted_tx_height);
+}
+
+void Identity16TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
if (kTransformFlipColumnsMask.Contains(tx_type)) {
FlipColumns<16>(src, tx_width);
}
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width, height,
- src);
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ Identity16ColumnStoreToFrame_SSE4_1(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
}
-void Identity32TransformLoop_SSE4_1(TransformType tx_type,
- TransformSize tx_size, void* src_buffer,
- int start_x, int start_y, void* dst_frame,
- bool is_row, int non_zero_coeff_count) {
- auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
- auto* src = static_cast<int16_t*>(src_buffer);
- const int tx_width = kTransformWidth[tx_size];
+void Identity32TransformLoopRow_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int /*start_x*/, int /*start_y*/,
+ void* /*dst_frame*/) {
const int tx_height = kTransformHeight[tx_size];
-
- if (is_row) {
- // When combining the identity32 multiplier with the row shift, the
- // calculations for tx_height == 8 and tx_height == 32 can be simplified
- // from ((A * 4) + 2) >> 2) to A.
- if ((tx_height & 0x28) != 0) {
- return;
- }
-
- // Process kTransformSize32x16. The src is always rounded before the
- // identity transform and shifted by 1 afterwards.
-
- if (Identity32DcOnly(&src[0], &src[0], non_zero_coeff_count)) {
- return;
- }
-
- const int num_rows =
- GetNumRows<32>(tx_type, tx_height, non_zero_coeff_count);
-
- // Process kTransformSize32x16
- assert(tx_size == kTransformSize32x16);
- ApplyRounding<32>(src, num_rows);
- int i = 0;
- do {
- Identity32Row16_SSE4_1(&src[i * 32], &src[i * 32], /*step=*/32);
- i += 4;
- } while (i < num_rows);
+ // When combining the identity32 multiplier with the row shift, the
+ // calculations for tx_height == 8 and tx_height == 32 can be simplified
+ // from ((A * 4) + 2) >> 2) to A.
+ if ((tx_height & 0x28) != 0) {
return;
}
- assert(!is_row);
- const int height = (non_zero_coeff_count == 1) ? 1 : tx_height;
- Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width, height, src);
+ // Process kTransformSize32x16. The src is always rounded before the
+ // identity transform and shifted by 1 afterwards.
+ auto* src = static_cast<int16_t*>(src_buffer);
+ if (Identity32DcOnly(src, adjusted_tx_height)) {
+ return;
+ }
+
+ assert(tx_size == kTransformSize32x16);
+ ApplyRounding<32>(src, adjusted_tx_height);
+ int i = 0;
+ do {
+ Identity32Row16_SSE4_1(&src[i * 32], /*step=*/32);
+ i += 4;
+ } while (i < adjusted_tx_height);
}
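
Note: 0x28 is 8 | 32, so the early return covers tx_height 8 and 32, where the identity32 multiplier and the row shift cancel to A; per the assert above, only kTransformSize32x16 reaches the loop. Compile-time restatement:

    // tx_height is 8, 16, or 32 for a 32-wide identity row transform.
    static_assert((8 & 0x28) != 0 && (32 & 0x28) != 0 && (16 & 0x28) == 0,
                  "only tx_height == 16 needs the identity32 row pass");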
-void Wht4TransformLoop_SSE4_1(TransformType tx_type, TransformSize tx_size,
- void* src_buffer, int start_x, int start_y,
- void* dst_frame, bool is_row,
- int non_zero_coeff_count) {
+void Identity32TransformLoopColumn_SSE4_1(TransformType /*tx_type*/,
+ TransformSize tx_size,
+ int adjusted_tx_height,
+ void* src_buffer, int start_x,
+ int start_y, void* dst_frame) {
+ auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
+ auto* src = static_cast<int16_t*>(src_buffer);
+ const int tx_width = kTransformWidth[tx_size];
+
+ Identity32ColumnStoreToFrame(frame, start_x, start_y, tx_width,
+ adjusted_tx_height, src);
+}
+
+void Wht4TransformLoopRow_SSE4_1(TransformType tx_type, TransformSize tx_size,
+ int /*adjusted_tx_height*/,
+ void* /*src_buffer*/, int /*start_x*/,
+ int /*start_y*/, void* /*dst_frame*/) {
assert(tx_type == kTransformTypeDctDct);
assert(tx_size == kTransformSize4x4);
static_cast<void>(tx_type);
static_cast<void>(tx_size);
- if (is_row) {
- // Do both row and column transforms in the column-transform pass.
- return;
- }
+ // Do both row and column transforms in the column-transform pass.
+}
- assert(!is_row);
+void Wht4TransformLoopColumn_SSE4_1(TransformType tx_type,
+ TransformSize tx_size,
+ int adjusted_tx_height, void* src_buffer,
+ int start_x, int start_y, void* dst_frame) {
+ assert(tx_type == kTransformTypeDctDct);
+ assert(tx_size == kTransformSize4x4);
+ static_cast<void>(tx_type);
+ static_cast<void>(tx_size);
+
+ // Do both row and column transforms in the column-transform pass.
// Process 4 1d wht4 rows and columns in parallel.
const auto* src = static_cast<int16_t*>(src_buffer);
auto& frame = *static_cast<Array2DView<uint8_t>*>(dst_frame);
- Wht4_SSE4_1(frame, start_x, start_y, src, non_zero_coeff_count);
+ Wht4_SSE4_1(frame, start_x, start_y, src, adjusted_tx_height);
}
//------------------------------------------------------------------------------
-template <typename Residual, typename Pixel>
-void InitAll(Dsp* const dsp) {
- // Maximum transform size for Dct is 64.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
- Dct4TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
- Dct8TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
- Dct16TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
- Dct32TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
- Dct64TransformLoop_SSE4_1;
-
- // Maximum transform size for Adst is 16.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
- Adst4TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
- Adst8TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
- Adst16TransformLoop_SSE4_1;
-
- // Maximum transform size for Identity transform is 32.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
- Identity4TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
- Identity8TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
- Identity16TransformLoop_SSE4_1;
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
- Identity32TransformLoop_SSE4_1;
-
- // Maximum transform size for Wht is 4.
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
- Wht4TransformLoop_SSE4_1;
-}
-
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
-#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
- InitAll<int16_t, uint8_t>(dsp);
-#else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
+
+ // Maximum transform size for Dct is 64.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformDct)
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformDct] =
- Dct4TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kRow] =
+ Dct4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize4][kColumn] =
+ Dct4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformDct)
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformDct] =
- Dct8TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kRow] =
+ Dct8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize8][kColumn] =
+ Dct8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformDct)
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformDct] =
- Dct16TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kRow] =
+ Dct16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize16][kColumn] =
+ Dct16TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformDct)
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformDct] =
- Dct32TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kRow] =
+ Dct32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize32][kColumn] =
+ Dct32TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize64_1DTransformDct)
- dsp->inverse_transforms[k1DTransformSize64][k1DTransformDct] =
- Dct64TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kRow] =
+ Dct64TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformDct][k1DTransformSize64][kColumn] =
+ Dct64TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Adst is 16.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformAdst] =
- Adst4TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kRow] =
+ Adst4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize4][kColumn] =
+ Adst4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformAdst] =
- Adst8TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kRow] =
+ Adst8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize8][kColumn] =
+ Adst8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformAdst)
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformAdst] =
- Adst16TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kRow] =
+ Adst16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformAdst][k1DTransformSize16][kColumn] =
+ Adst16TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Identity transform is 32.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformIdentity] =
- Identity4TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kRow] =
+ Identity4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize4][kColumn] =
+ Identity4TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize8_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformSize8][k1DTransformIdentity] =
- Identity8TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kRow] =
+ Identity8TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize8][kColumn] =
+ Identity8TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize16_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformSize16][k1DTransformIdentity] =
- Identity16TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kRow] =
+ Identity16TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize16][kColumn] =
+ Identity16TransformLoopColumn_SSE4_1;
#endif
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize32_1DTransformIdentity)
- dsp->inverse_transforms[k1DTransformSize32][k1DTransformIdentity] =
- Identity32TransformLoop_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kRow] =
+ Identity32TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformIdentity][k1DTransformSize32][kColumn] =
+ Identity32TransformLoopColumn_SSE4_1;
#endif
+
+ // Maximum transform size for Wht is 4.
#if DSP_ENABLED_8BPP_SSE4_1(1DTransformSize4_1DTransformWht)
- dsp->inverse_transforms[k1DTransformSize4][k1DTransformWht] =
- Wht4TransformLoop_SSE4_1;
-#endif
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kRow] =
+ Wht4TransformLoopRow_SSE4_1;
+ dsp->inverse_transforms[k1DTransformWht][k1DTransformSize4][kColumn] =
+ Wht4TransformLoopColumn_SSE4_1;
#endif
}
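
Note: the dispatch table is reindexed from inverse_transforms[size][transform] to inverse_transforms[transform][size][pass], so each guard now registers the row and column functions as a pair, and the blanket InitAll path is gone. Assumed shape of the table after this change; the bounds and the kRow/kColumn constants are inferred from the registrations above, not quoted from dsp.h, and InverseTransformAddFunc is the alias sketched earlier:

    enum TransformPass { kRow = 0, kColumn = 1, kNumPasses = 2 };
    constexpr int kNum1DTransforms = 4;      // Dct, Adst, Identity, Wht
    constexpr int kNum1DTransformSizes = 5;  // 4, 8, 16, 32, 64

    struct DspSketch {
      InverseTransformAddFunc
          inverse_transforms[kNum1DTransforms][kNum1DTransformSizes]
                            [kNumPasses];
    };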
@@ -3109,7 +3021,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -3117,4 +3029,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/inverse_transform_sse4.h b/libgav1/src/dsp/x86/inverse_transform_sse4.h
index 423173b..106084b 100644
--- a/libgav1/src/dsp/x86/inverse_transform_sse4.h
+++ b/libgav1/src/dsp/x86/inverse_transform_sse4.h
@@ -32,7 +32,7 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformDct LIBGAV1_CPU_SSE4_1
@@ -85,5 +85,5 @@
#ifndef LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht
#define LIBGAV1_Dsp8bpp_1DTransformSize4_1DTransformWht LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_INVERSE_TRANSFORM_SSE4_H_
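
Note: the LIBGAV1_ENABLE_SSE4_1 to LIBGAV1_TARGETING_SSE4_1 rename, applied across the x86 files in this change, presumably reflects what the macro actually answers: whether this translation unit is being compiled with SSE4.1 code generation, rather than a user-facing opt-in. The guarded #ifndef pattern itself is unchanged; sketched with a stand-in slot name (FOO is not a libgav1 symbol):

    #if LIBGAV1_TARGETING_SSE4_1
    // A higher baseline (e.g. AVX2) may already have claimed the slot by
    // defining it first; only an unclaimed slot is marked as provided by
    // the SSE4.1 build.
    #ifndef LIBGAV1_Dsp8bpp_FOO
    #define LIBGAV1_Dsp8bpp_FOO LIBGAV1_CPU_SSE4_1
    #endif
    #endif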
diff --git a/libgav1/src/dsp/x86/loop_filter_sse4.cc b/libgav1/src/dsp/x86/loop_filter_sse4.cc
index 462b885..b9da2d5 100644
--- a/libgav1/src/dsp/x86/loop_filter_sse4.cc
+++ b/libgav1/src/dsp/x86/loop_filter_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/loop_filter.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -350,7 +350,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
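
Note: _mm_test_all_zeros(a, mask) returns 1 iff (a & mask) == 0. The old code materialized an all-ones operand with _mm_cmpeq_epi8(v_mask, v_mask); since v & v == v, testing the mask against itself is equivalent and saves that extra instruction. The same simplification is applied to every _mm_cmpeq_epi8/_mm_cmpeq_epi16 variant in the rest of this file:

    #include <smmintrin.h>

    // (v & ones) == 0 and (v & v) == 0 test the same condition; the second
    // form needs no all-ones constant before the PTEST.
    inline bool AnyBitSetSketch(__m128i v) {
      return _mm_test_all_zeros(v, v) == 0;
    }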
@@ -454,7 +454,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat3_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -595,7 +595,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -697,7 +697,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -838,7 +838,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i p6 = Load4(dst - 7 * stride);
const __m128i p5 = Load4(dst - 6 * stride);
const __m128i p4 = Load4(dst - 5 * stride);
@@ -864,8 +864,7 @@
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -1050,7 +1049,7 @@
const __m128i v_mask =
_mm_shuffle_epi32(_mm_and_si128(v_needs_mask, v_isflat4_mask), 0);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi8(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i v_isflatouter4_mask =
IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
const __m128i v_flat4_mask =
@@ -1066,8 +1065,7 @@
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi8(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -1458,7 +1456,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -1572,7 +1570,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat3_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp1_f6;
__m128i oqp0_f6;
@@ -1711,7 +1709,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -1821,7 +1819,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
__m128i oqp2_f8;
__m128i oqp1_f8;
__m128i oqp0_f8;
@@ -1957,7 +1955,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i p6 = LoadLo8(dst - 7 * stride);
const __m128i p5 = LoadLo8(dst - 6 * stride);
const __m128i p4 = LoadLo8(dst - 5 * stride);
@@ -1984,8 +1982,7 @@
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -2133,7 +2130,7 @@
const __m128i v_mask_lo = _mm_and_si128(v_needs_mask, v_isflat4_mask);
const __m128i v_mask = _mm_unpacklo_epi64(v_mask_lo, v_mask_lo);
- if (_mm_test_all_zeros(v_mask, _mm_cmpeq_epi16(v_mask, v_mask)) == 0) {
+ if (_mm_test_all_zeros(v_mask, v_mask) == 0) {
const __m128i v_isflatouter4_mask =
IsFlat4(qp6, qp5, qp4, qp0, v_flat_thresh);
const __m128i v_flat4_mask_lo = _mm_and_si128(v_mask, v_isflatouter4_mask);
@@ -2150,8 +2147,7 @@
oqp1 = _mm_blendv_epi8(oqp1, oqp1_f8, v_mask);
oqp0 = _mm_blendv_epi8(oqp0, oqp0_f8, v_mask);
- if (_mm_test_all_zeros(v_flat4_mask,
- _mm_cmpeq_epi16(v_flat4_mask, v_flat4_mask)) == 0) {
+ if (_mm_test_all_zeros(v_flat4_mask, v_flat4_mask) == 0) {
__m128i oqp5_f14;
__m128i oqp4_f14;
__m128i oqp3_f14;
@@ -2245,7 +2241,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -2253,4 +2249,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/loop_filter_sse4.h b/libgav1/src/dsp/x86/loop_filter_sse4.h
index b8c1fe5..4795d8b 100644
--- a/libgav1/src/dsp/x86/loop_filter_sse4.h
+++ b/libgav1/src/dsp/x86/loop_filter_sse4.h
@@ -32,7 +32,7 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal that the sse4 implementation should be
// used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal
#define LIBGAV1_Dsp8bpp_LoopFilterSize4_LoopFilterTypeHorizontal \
@@ -114,6 +114,6 @@
LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_LOOP_FILTER_SSE4_H_
diff --git a/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc b/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc
new file mode 100644
index 0000000..b38f322
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_10bit_avx2.cc
@@ -0,0 +1,3157 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
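+  // A sketch of the arithmetic, assuming kWienerFilterBits == 7 and
+  // kInterRoundBitsHorizontal == 3 as defined elsewhere in libgav1:
+  // offset == 1 << 13 == 8192 and limit == 32767, so the clipped result
+  // below lies in [-8192, 24575] and always fits in int16_t.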
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m256i sum0 = _mm256_add_epi32(s[0], round);
+ const __m256i sum1 = _mm256_add_epi32(s[1], round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum1 =
+ _mm256_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m256i rounded_sum = _mm256_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7Kernel(const __m256i s[7],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const __m256i s06 = _mm256_add_epi16(s[0], s[6]);
+ const __m256i s15 = _mm256_add_epi16(s[1], s[5]);
+ const __m256i s24 = _mm256_add_epi16(s[2], s[4]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s06, s15);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s06, s15);
+ const __m256i ss2 = _mm256_unpacklo_epi16(s24, s[3]);
+ const __m256i ss3 = _mm256_unpackhi_epi16(s24, s[3]);
+ __m256i madds[4];
+ madds[0] = _mm256_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm256_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm256_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm256_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm256_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[5], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s04 = _mm256_add_epi16(s[0], s[4]);
+ const __m256i s13 = _mm256_add_epi16(s[1], s[3]);
+ const __m256i s2d = _mm256_add_epi16(s[2], s[2]);
+ const __m256i s0m = _mm256_sub_epi16(s04, s2d);
+ const __m256i s1m = _mm256_sub_epi16(s13, s2d);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s0m, s1m);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s0m, s1m);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
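+  // A sketch of the center-tap handling, assuming the coefficients sum to
+  // 128 (1 << kWienerFilterBits) as in the AV1 Wiener filter, with the
+  // outermost pair zero in the 5-tap case. Writing c1 and c2 for the two
+  // taps in |filter|, the center coefficient is 128 - 2 * (c1 + c2), so
+  //   c1 * s04 + c2 * s13 + (128 - 2 * c1 - 2 * c2) * s[2]
+  //     == c1 * s0m + c2 * s1m + 128 * s[2],
+  // i.e. the madd results above plus s[2] << 7 below.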
+ const __m256i s2_lo = _mm256_unpacklo_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2_hi = _mm256_unpackhi_epi16(s[2], _mm256_setzero_si256());
+ const __m256i s2x128_lo = _mm256_slli_epi32(s2_lo, 7);
+ const __m256i s2x128_hi = _mm256_slli_epi32(s2_hi, 7);
+ madds[0] = _mm256_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm256_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[3], const __m256i filter,
+ int16_t* const wiener_buffer) {
+ const __m256i s02 = _mm256_add_epi16(s[0], s[2]);
+ const __m256i ss0 = _mm256_unpacklo_epi16(s02, s[1]);
+ const __m256i ss1 = _mm256_unpackhi_epi16(s02, s[1]);
+ __m256i madds[2];
+ madds[0] = _mm256_madd_epi16(ss0, filter);
+ madds[1] = _mm256_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(*coefficients, 0x0);
+ filter[1] = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[7];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ s[5] = LoadUnaligned32(src + x + 5);
+ s[6] = LoadUnaligned32(src + x + 6);
+ WienerHorizontalTap7Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+ const __m256i filter =
+ _mm256_shuffle_epi8(*coefficients, _mm256_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[5];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ s[3] = LoadUnaligned32(src + x + 3);
+ s[4] = LoadUnaligned32(src + x + 4);
+ WienerHorizontalTap5Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i* const coefficients,
+ int16_t** const wiener_buffer) {
+  const __m256i filter = _mm256_shuffle_epi32(*coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i s[3];
+ s[0] = LoadUnaligned32(src + x + 0);
+ s[1] = LoadUnaligned32(src + x + 1);
+ s[2] = LoadUnaligned32(src + x + 2);
+ WienerHorizontalTap3Kernel(s, filter, *wiener_buffer + x);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s0 = LoadUnaligned32(src + x);
+ const __m256i d0 = _mm256_slli_epi16(s0, 4);
+ StoreAligned32(*wiener_buffer + x, d0);
+ x += 16;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[4], const __m256i filter[4]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd3 = _mm256_madd_epi16(a[3], filter[3]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i madd23 = _mm256_add_epi32(madd2, madd3);
+ const __m256i sum = _mm256_add_epi32(madd01, madd23);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[3], const __m256i filter[3]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i madd2 = _mm256_madd_epi16(a[2], filter[2]);
+ const __m256i madd01 = _mm256_add_epi32(madd0, madd1);
+ const __m256i sum = _mm256_add_epi32(madd01, madd2);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalClip(const __m256i s[2]) {
+ const __m256i d = _mm256_packus_epi32(s[0], s[1]);
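+  // Clip to the 10-bit pixel maximum, (1 << 10) - 1 == 1023.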
+ return _mm256_min_epu16(d, _mm256_set1_epi16(1023));
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[4], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm256_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm256_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[3]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[3], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm256_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm256_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2], c[2];
+ b[0] = _mm256_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[3], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ filter[2] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[3] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
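+  // Each 32-bit lane of filter[3] holds the 16-bit pair (coefficients[0], 1),
+  // so the madd in WienerVertical7() computes
+  // a[6] * coefficients[0] + round in a single instruction.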
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi8(c, _mm256_set1_epi32(0x03020504));
+ filter[2] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x, width, filter, d);
+ StoreUnaligned32(dst + x, d[0]);
+ StoreUnaligned32(dst + dst_stride + x, d[1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m256i filter[2];
+ filter[0] =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x, width, filter, d[0]);
+ StoreUnaligned32(dst + x, d[0][0]);
+ StoreUnaligned32(dst + dst_stride + x, d[0][1]);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreUnaligned32(dst + x, d);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m256i a = LoadAligned32(wiener_buffer);
+ const __m256i b = _mm256_add_epi16(a, _mm256_set1_epi16(8));
+ const __m256i c = _mm256_srai_epi16(b, 4);
+ const __m256i d = _mm256_max_epi16(c, _mm256_setzero_si256());
+ const __m256i e = _mm256_min_epi16(d, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 16;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 16;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const __m256i coefficients_horizontal = _mm256_broadcastq_epi64(c);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ &coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, &coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD loads overread each row by (pixels per SIMD register) - (width % 8) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. SIMD
+// registers hold 16 bytes for SSE4.1 and 32 bytes for AVX2; the constants
+// below are expressed in bytes.
+constexpr int kOverreadInBytesPass1_128 = 4;
+constexpr int kOverreadInBytesPass2_128 = 8;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
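+// For example, with 16-bit pixels and a 16-byte SSE4.1 register:
+// Pass 1: 16 - 2 * 3 * sizeof(uint16_t) == 4 bytes of overread, and
+// Pass 2: 16 - 2 * 2 * sizeof(uint16_t) == 8 bytes; the _256 constants add
+// the extra 16 bytes of a 32-byte AVX2 register.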
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// The AVX2 ymm register holds ma[0], ma[1], ..., ma[7], and ma[16], ma[17],
+// ..., ma[23].
+// There is an 8-pixel gap between the first half and the second half.
+constexpr int kMaStoreOffset = 8;
+
+inline void StoreAligned32_ma(uint16_t* src, const __m256i v) {
+ StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v, 0));
+ StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v, 1));
+}
+
+inline void StoreAligned64_ma(uint16_t* src, const __m256i v[2]) {
+ // The next 4 lines are much faster than:
+ // StoreAligned32(src + 0, _mm256_permute2x128_si256(v[0], v[1], 0x20));
+ // StoreAligned32(src + 16, _mm256_permute2x128_si256(v[0], v[1], 0x31));
+ StoreAligned16(src + 0 * 8, _mm256_extracti128_si256(v[0], 0));
+ StoreAligned16(src + 1 * 8, _mm256_extracti128_si256(v[1], 0));
+ StoreAligned16(src + 2 * 8, _mm256_extracti128_si256(v[0], 1));
+ StoreAligned16(src + 3 * 8, _mm256_extracti128_si256(v[1], 1));
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate super inefficient code and the whole
+// decoder could be 15% slower.
+
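+// The Vaddl/Vaddw/Vmull/Vrshr helpers below follow the naming of the
+// corresponding NEON intrinsics: widening add, widening accumulate, widening
+// multiply, and rounding shift right.
+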
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrU16(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi16(src0, _mm256_set1_epi16(1 << (src1 - 1)));
+ return _mm256_srli_epi16(sum, src1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
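+// Interleaving each 16-bit value with zero lets the madd instruction square
+// it: every 32-bit result lane is v * v + 0 * 0.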
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+inline void Square(const __m256i src, __m256i dst[2]) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src, _mm256_setzero_si256());
+ dst[0] = _mm256_madd_epi16(s0, s0);
+ dst[1] = _mm256_madd_epi16(s1, s1);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare3_32(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline void Prepare5_32(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_32(const __m256i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline __m256i Sum5_32(const __m256i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline __m256i Sum3Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ return Sum3_16(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline __m256i Sum5Horizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ return Sum5_16(s);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3, __m256i* const row5) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 2);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 4);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 6);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 8);
+ const __m256i sum04 = _mm256_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm256_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const uint16_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ SumHorizontal16(src + 0, over_read_in_bytes + 0, row3_0, row5_0);
+ SumHorizontal16(src + 16, over_read_in_bytes + 32, row3_1, row5_1);
+}
+
+inline void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = _mm256_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal32(const __m256i src[3], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum3Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+inline void Sum5Horizontal32(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
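+// The Sum343*() helpers compute sums with taps (3, 4, 3):
+// 3 * (a + b + c) + b == 3 * a + 4 * b + 3 * c.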
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return _mm256_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
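+// The Sum565*() helpers compute sums with taps (5, 6, 5):
+// 5 * (a + b + c) + b == 5 * a + 6 * b + 5 * c.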
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565(const __m256i src[3]) {
+ const __m256i sum = Sum3_32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return _mm256_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m256i src[3], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass1_128 - sizeof(*src) * width;
+ const ptrdiff_t overread_in_bytes_256 =
+ kOverreadInBytesPass1_256 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s0[2], sq_128[4], s3, s5, sq3[2], sq5[2];
+ __m256i sq[8];
+ s0[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128 + 0);
+ Square(s0[1], sq_128 + 2);
+ SumHorizontal16(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal32(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8),
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 4, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int overread_in_bytes_128, overread_in_bytes_256;
+ if (size == 3) {
+ overread_in_bytes_128 = kOverreadInBytesPass2_128;
+ overread_in_bytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ overread_in_bytes_128 = kOverreadInBytesPass1_128;
+ overread_in_bytes_256 = kOverreadInBytesPass1_256;
+ }
+ overread_in_bytes_128 -= sizeof(*src) * width;
+ overread_in_bytes_256 -= sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s_128[2], ss, sq_128[4], sqs[2];
+ __m256i sq[8];
+ s_128[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128);
+ s_128[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s_128[0], sq_128 + 0);
+ Square(s_128[1], sq_128 + 2);
+ if (size == 3) {
+ ss = Sum3Horizontal16(s_128);
+ Sum3Horizontal32(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal16(s_128);
+ Sum5Horizontal32(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i s[2], row[2], row_sq[4];
+ s[0] = LoadUnaligned32Msan(
+ src + 8, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ s[1] = LoadUnaligned32Msan(
+ src + 24,
+ overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 24));
+ Square(s[0], sq + 2);
+ Square(s[1], sq + 6);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum3Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 4, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(
+ src, overread_in_bytes_256 + sizeof(*src) * (sum_width - x + 8));
+ row[1] =
+ Sum5Horizontal16(src + 16, overread_in_bytes_256 +
+ sizeof(*src) * (sum_width - x + 24));
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 4, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
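+  // n == 9: sum_sq * 9 == sum_sq + (sum_sq << 3).
+  // n == 25: sum_sq * 25 == sum_sq + (sum_sq << 3) + (sum_sq << 4).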
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
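+  // For 10-bit input, |sum| and |sum_sq| can be 4 and 16 times larger than
+  // in the 8-bit case; the rounding shifts by 2 and 4 below presumably scale
+  // them back into the same range as the 8-bit math in CalculateMa() above.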
+ const __m128i b = VrshrU16(sum, 2);
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i b = VrshrU16(sum, 2);
+ const __m256i sum_lo = _mm256_unpacklo_epi16(b, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(b, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
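+  // m == ma * 41, computed with the unsigned x signed maddubs since each
+  // |ma| byte is in [0, 255] and 41 fits in a signed byte. Multiplying by 41
+  // and shifting by kSgrProjReciprocalBits - 2 equals multiplying by
+  // one_over_n == 164 and shifting by kSgrProjReciprocalBits, because
+  // 164 == 41 << 2.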
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB5(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateB3(const __m256i sum, const __m256i ma, __m256i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored to and loaded from memory; the
+  // compiler keeps them in a 64-bit general-purpose register. This is faster
+  // than using _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * sum * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x1023 = 25575.
+  // For the second pass radius is 1. Maximum value is 3x3x1023 = 9207.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 25575 * 164 >> 12 = 261119 (18 bits).
+  // Radius 1: 255 * 9207 * 455 >> 12 = 260801 (18 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
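+
+// A scalar sketch of what LookupIntermediate() computes per lane (names
+// illustrative; the saturating pack above clamps each index to [0, 255]):
+//   const int idx = std::min(index_i, 255);
+//   const uint8_t ma_i = kSgrMaLookup[idx];
+//   const uint32_t b_i = RightShiftWithRounding(ma_i * sum_i * one_over_n,
+//                                               kSgrProjReciprocalBits);
+// CalculateB3()/CalculateB5() produce the same b values in vector form.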
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
+
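+// The per-lane repetition in kSgrMaLookupAvx2 is needed because
+// _mm256_shuffle_epi8 performs two independent 16-byte table lookups, one per
+// 128-bit lane; duplicating each 16-entry block lets both lanes index the
+// same values.
+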
+// Set the shuffle control mask to (1xxxxxxx)b for indices outside the range
+// [0, 15] so that the shuffle result is 0. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
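+
+// Worked examples for ShuffleIndex(): for index == 9 the comparison yields 0,
+// the control byte stays 9, and table[9] is returned. For index == 20 the
+// comparison yields 0xff, the OR sets the control's sign bit, and the shuffle
+// returns 0. For an index that has gone negative (after the subtract-16 steps
+// in CalculateIntermediate()), the index's own sign bit forces the 0 result.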
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
+}
+
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
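+
+// A worked example for AdjustValue(): with threshold == 55, |thresholds| is
+// set1_epi8(-73). For an original index of 60, |index| holds 60 - 128 == -68,
+// and -68 > -73, so the comparison yields -1 in that byte and the value drops
+// by 1. Chaining the five thresholds used below turns the base value 5 into
+// the correct kSgrMaLookup value for every index in [48, 255].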
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+  // Elements whose indices are not less than 48 become 0 in the shuffles
+  // below.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+  // Elements whose indices are larger than 47 rarely change value as the
+  // index increases, so their values are computed with comparisons and
+  // additions instead of table lookups.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (currently 0) are set to 5.
+  *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+  // b = ma * sum * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x1023 = 25575.
+  // For the second pass radius is 1. Maximum value is 3x3x1023 = 9207.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 25575 * 164 >> 12 = 261119 (18 bits).
+  // Radius 1: 255 * 9207 * 455 >> 12 = 260801 (18 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b0[2], __m256i b1[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]); // 0 2 1 3
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+  // Elements whose indices are not less than 48 become 0 in the shuffles
+  // below.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+  // Elements whose indices are larger than 47 rarely change value as the
+  // index increases, so their values are computed with comparisons and
+  // additions instead of table lookups.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47 (currently 0) are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x63); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+  // b = ma * sum * one_over_n
+  // |ma| = [0, 255]
+  // |sum| is a box sum with radius 1 or 2.
+  // For the first pass radius is 2. Maximum value is 5x5x1023 = 25575.
+  // For the second pass radius is 1. Maximum value is 3x3x1023 = 9207.
+  // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+  // When radius is 2 |n| is 25. |one_over_n| is 164.
+  // When radius is 1 |n| is 9. |one_over_n| is 455.
+  // |kSgrProjReciprocalBits| is 12.
+  // Radius 2: 255 * 25575 * 164 >> 12 = 261119 (18 bits).
+  // Radius 1: 255 * 9207 * 455 >> 12 = 260801 (18 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ __m256i sums[2];
+ sums[0] = _mm256_permute2x128_si256(sum[0], sum[1], 0x20);
+ sums[1] = _mm256_permute2x128_si256(sum[0], sum[1], 0x31);
+ if (n == 9) {
+ CalculateB3(sums[0], maq0, b0);
+ CalculateB3(sums[1], maq1, b1);
+ } else {
+ CalculateB5(sums[0], maq0, b0);
+ CalculateB5(sums[1], maq1, b1);
+ }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m256i b3[3], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm256_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm256_add_epi32(sum_b343[1], b[1]);
+ StoreAligned64(b444 + x, sum_b444);
+ StoreAligned64(b343 + x, sum_b343);
+}
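+
+// Store343_444() relies on the weighted-sum identity
+//   (3, 4, 3) == 4 * (1, 1, 1) - (1, 1, 1) + (0, 1, 0),
+// i.e. sum_b444 == 4 * sum_b111 and sum_b343 == sum_b444 - sum_b111 + b[1],
+// so both outputs are derived from one plain 3-tap sum.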
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32_ma(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32_ma(ma343 + x, *sum_ma343);
+ Store343_444(b3, x + kMaStoreOffset, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+// Don't combine the following two functions; the combined version is slower.
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi,
+ __m256i* const sum_ma444_lo,
+ __m256i* const sum_ma444_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], __m256i sum_b444_lo[2],
+ __m256i sum_b444_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_mat343[2], sum_mat444[2];
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_mat444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_mat444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444_lo, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_mat444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ *sum_ma444_lo = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x20);
+ *sum_ma444_hi = _mm256_permute2x128_si256(sum_mat444[0], sum_mat444[1], 0x31);
+ StoreAligned32(ma444 + x + 0, *sum_ma444_lo);
+ StoreAligned32(ma444 + x + 16, *sum_ma444_hi);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_mat444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444_hi, b343, b444);
+}
+
+inline void Store343_444(const __m256i ma3[3], const __m256i b3[6],
+ const ptrdiff_t x, __m256i* const sum_ma343_lo,
+ __m256i* const sum_ma343_hi, __m256i sum_b343_lo[2],
+ __m256i sum_b343_hi[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444[2], sum_b444[2], sum_mat343[2];
+ const __m256i sum_ma111_lo = Sum3WLo16(ma3);
+ sum_ma444[0] = _mm256_slli_epi16(sum_ma111_lo, 2);
+ const __m256i sum333_lo = _mm256_sub_epi16(sum_ma444[0], sum_ma111_lo);
+ sum_mat343[0] = VaddwLo8(sum333_lo, ma3[1]);
+ Store343_444(b3, x, sum_b343_lo, sum_b444, b343, b444);
+ const __m256i sum_ma111_hi = Sum3WHi16(ma3);
+ sum_ma444[1] = _mm256_slli_epi16(sum_ma111_hi, 2);
+ StoreAligned64_ma(ma444 + x, sum_ma444);
+ const __m256i sum333_hi = _mm256_sub_epi16(sum_ma444[1], sum_ma111_hi);
+ sum_mat343[1] = VaddwHi8(sum333_hi, ma3[1]);
+ *sum_ma343_lo = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x20);
+ *sum_ma343_hi = _mm256_permute2x128_si256(sum_mat343[0], sum_mat343[1], 0x31);
+ StoreAligned32(ma343 + x + 0, *sum_ma343_lo);
+ StoreAligned32(ma343 + x + 16, *sum_ma343_hi);
+ Store343_444(b3 + 3, x + 16, sum_b343_hi, sum_b444, b343, b444);
+}
+
+inline void PermuteB(const __m256i t[4], __m256i b[7]) {
+ // Input:
+ // 0 1 2 3 // b[0]
+ // 4 5 6 7 // b[1]
+ // 8 9 10 11 24 25 26 27 // t[0]
+ // 12 13 14 15 28 29 30 31 // t[1]
+ // 16 17 18 19 32 33 34 35 // t[2]
+ // 20 21 22 23 36 37 38 39 // t[3]
+
+ // Output:
+ // 0 1 2 3 8 9 10 11 // b[0]
+ // 4 5 6 7 12 13 14 15 // b[1]
+ // 8 9 10 11 16 17 18 19 // b[2]
+ // 16 17 18 19 24 25 26 27 // b[3]
+ // 20 21 22 23 28 29 30 31 // b[4]
+ // 24 25 26 27 32 33 34 35 // b[5]
+ // 20 21 22 23 36 37 38 39 // b[6]
+ b[0] = _mm256_permute2x128_si256(b[0], t[0], 0x21);
+ b[1] = _mm256_permute2x128_si256(b[1], t[1], 0x21);
+ b[2] = _mm256_permute2x128_si256(t[0], t[2], 0x20);
+ b[3] = _mm256_permute2x128_si256(t[2], t[0], 0x30);
+ b[4] = _mm256_permute2x128_si256(t[3], t[1], 0x30);
+ b[5] = _mm256_permute2x128_si256(t[0], t[2], 0x31);
+ b[6] = t[3];
+}
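+
+// PermuteB() relies on the _mm256_permute2x128_si256 control byte: bits [1:0]
+// select the output's low 128-bit lane and bits [5:4] the high lane, where
+// values 0/1 pick the lanes of the first source and 2/3 the lanes of the
+// second. Hence 0x20 == {src1.lo, src2.lo}, 0x21 == {src1.hi, src2.lo},
+// 0x30 == {src1.lo, src2.hi} and 0x31 == {src1.hi, src2.hi}, which yields the
+// output layout documented above.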
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][8], __m256i ma[3],
+ __m256i b[3]) {
+ __m256i s[2], s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src0 + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src0 + 16, over_read_in_bytes + 32);
+ s5[0][4] = Sum5Horizontal16(src1 + 0, over_read_in_bytes + 0);
+ s5[1][4] = Sum5Horizontal16(src1 + 16, over_read_in_bytes + 32);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s5[0][3] = Sum5Horizontal16(src + 0, over_read_in_bytes + 0);
+ s5[1][3] = Sum5Horizontal16(src + 16, over_read_in_bytes + 32);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[8],
+ __m256i ma[3], __m256i b[7]) {
+ __m256i s[2], s3[4], sq3[3][2], sum[2], index[2], t[4];
+ s[0] = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s[0], sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ s3[2] = Sum3Horizontal16(src, over_read_in_bytes);
+ s3[3] = Sum3Horizontal16(src + 16, over_read_in_bytes + 32);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[1], sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, t, t + 2);
+ PermuteB(t, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][3],
+ __m128i b3[2][10], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][8], __m256i ma3[2][3],
+ __m256i b3[2][7], __m256i ma5[3], __m256i b5[5]) {
+ __m256i s[2], s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2],
+ index_3[2][2], sum_5[2], index_5[2], t[4];
+ s[0] = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 16);
+ s[1] = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 16);
+ Square(s[0], sq[0] + 2);
+ Square(s[1], sq[1] + 2);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[0][1] = _mm256_permute2x128_si256(sq[0][1], sq[0][3], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ sq[1][1] = _mm256_permute2x128_si256(sq[1][1], sq[1][3], 0x21);
+ SumHorizontal16(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal16(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ s[0] = LoadUnaligned32Msan(src0 + 24, over_read_in_bytes + 48);
+ s[1] = LoadUnaligned32Msan(src1 + 24, over_read_in_bytes + 48);
+ Square(s[0], sq[0] + 6);
+ Square(s[1], sq[1] + 6);
+ sq[0][4] = _mm256_permute2x128_si256(sq[0][2], sq[0][6], 0x21);
+ sq[0][5] = _mm256_permute2x128_si256(sq[0][3], sq[0][7], 0x21);
+ sq[1][4] = _mm256_permute2x128_si256(sq[1][2], sq[1][6], 0x21);
+ sq[1][5] = _mm256_permute2x128_si256(sq[1][3], sq[1][7], 0x21);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], t, t + 2);
+ PermuteB(t, b3[0]);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], t, t + 2);
+ PermuteB(t, b3[1]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint16_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 16);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2], t[4];
+ Square(s0, sq + 2);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ sq[1] = _mm256_permute2x128_si256(sq[1], sq[3], 0x21);
+ SumHorizontal16(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ const __m256i s1 = LoadUnaligned32Msan(src + 24, over_read_in_bytes + 48);
+ Square(s1, sq + 6);
+ sq[4] = _mm256_permute2x128_si256(sq[2], sq[6], 0x21);
+ sq[5] = _mm256_permute2x128_si256(sq[3], sq[7], 0x21);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, t, t + 2);
+ PermuteB(t, b3);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, t, t + 2);
+ PermuteB(t, b5);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[10];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 3, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
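+    // Rotate the trailing registers into the leading slots so the next
+    // iteration reuses them as left context and only loads new columns.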
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src) * width;
+ __m128i s[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes_128 + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes_128 + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(
+ src + x + 8, kOverreadInBytesPass2_256 + sizeof(*src) * (x + 8 - width),
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 3, kMaStoreOffset, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64_ma(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 3, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_128[3], sq_128[2][8], b3_128[2][10],
+ b5_128[10];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_128[0], b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_128[0], ma5_128[0]);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64_ma(ma343[0] + x, ma);
+ Sum343(b3[0], b);
+ Sum343(b3[0] + 3, b + 2);
+ StoreAligned64(b343[0] + x, b);
+ StoreAligned64(b343[0] + x + 16, b + 2);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 3, x + kMaStoreOffset, ma343[1], ma444,
+ b343[1], b444);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64_ma(ma565, ma);
+ Sum565(b5, b);
+ StoreAligned64(b565, b);
+ Sum565(b5 + 3, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+  // ma: 255 * 32 = 8160 (13 bits)
+  // b: 261119 * 32 = 8355808 (23 bits)
+  // v: b - ma * 1023 (24 bits)
+  const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+  // kSgrProjSgrBits = 8
+  // kSgrProjRestoreBits = 4
+  // shift = 4 or 5
+  // v >> 8 or 9 (15 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
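+
+// In scalar terms, FilterOutput() computes (a sketch; the vector code applies
+// this to packed 32-bit lanes, with |ma_x_src| already holding ma * src):
+//   RightShiftWithRounding(b - ma * src,
+//                          kSgrProjSgrBits + shift - kSgrProjRestoreBits)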
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+  return _mm256_packs_epi32(dst_lo, dst_hi);  // 15 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
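+
+// SelfGuidedFinal() applies the last step of the self-guided filter,
+//   dst = src + RightShiftWithRounding(
+//             w * filter, kSgrProjRestoreBits + kSgrProjPrecisionBits),
+// where the weighted filter sum w * filter arrives in |v| from the two
+// multiplier helpers below.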
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
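+
+// The packing above feeds _mm256_madd_epi16: each 32-bit lane of |w0_w2|
+// holds the 16-bit pair (w0, w2), and interleaving filter[0] with filter[1]
+// makes every madd lane compute w0 * p0 + w2 * p2 in one instruction. With
+// illustrative values w0 == 27, w2 == 20, p0 == 100 and p2 == -50, a lane
+// holds 27 * 100 + 20 * (-50) == 1700 before SelfGuidedFinal()'s final shift.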
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m256i val) {
+ const __m256i val0 = _mm256_max_epi16(val, _mm256_setzero_si256());
+ const __m256i val1 = _mm256_min_epi16(val0, _mm256_set1_epi16(1023));
+ StoreUnaligned32(dst, val1);
+}
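+
+// The clamp to [0, 1023] keeps the output in the valid 10-bit pixel range;
+// it is needed because the signed self-guided correction can push values
+// outside that range.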
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][2], ma0, sq_128[2][4], b0[2];
+ __m256i mas[3], sq[2][8], bs[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, b0);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ ma[3] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[1]);
+ StoreAligned32(ma565[1] + x + 16, ma[3]);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ StoreAligned64(b565[1] + x, b[1]);
+ StoreAligned64(b565[1] + x + 16, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma + 2, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x + 0);
+ const __m256i p10 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p10, w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[3], b[3]);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ ClipAndStore(dst + stride + x + 16, d11);
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma0[2], sq_128[8], b0[6];
+ __m256i mas[3], sq[8], bs[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq_128, &ma0[0],
+ b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0[0], ma0[0]);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[4], b[4][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[2] = Sum565Lo(ma5);
+ ma[3] = Sum565Hi(ma5);
+ Sum565(bs + 0, b[1]);
+ Sum565(bs + 3, b[3]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x + 0);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x20);
+ LoadAligned64(b565 + x, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr0_lo, p0, w0);
+ ClipAndStore(dst + x + 0, d0);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma565 + x + 16);
+ ma[1] = _mm256_permute2x128_si256(ma[2], ma[3], 0x31);
+ LoadAligned64(b565 + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr0_hi, ma, b + 2);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr0_hi, p1, w0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes_128 =
+ kOverreadInBytesPass2_128 - sizeof(*src0) * width;
+ __m128i s0[2], ma0, sq_128[4], b0[2];
+ __m256i mas[3], sq[8], bs[7];
+ s0[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes_128 + 0);
+ s0[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes_128 + 16);
+ Square(s0[0], sq_128);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, b0);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0[0], b0[0]);
+ bs[1] = SetrM128i(b0[1], b0[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(
+ src0 + x + 8,
+ kOverreadInBytesPass2_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444(ma3, bs, x, &ma[2], &ma[3], b[2], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr_lo = LoadUnaligned32(src + x + 0);
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ mas[0] = mas[2];
+ bs[0] = bs[5];
+ bs[1] = bs[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3_128[2][3], ma5_0, sq_128[2][8], b3_128[2][10], b5_128[2];
+ __m256i ma3[2][3], ma5[3], sq[2][8], b3[2][7], b5[7];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq_128[0]);
+ Square(s[1][0], sq_128[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, b5_128);
+ sq[0][0] = SetrM128i(sq_128[0][2], sq_128[0][2]);
+ sq[0][1] = SetrM128i(sq_128[0][3], sq_128[0][3]);
+ sq[1][0] = SetrM128i(sq_128[1][2], sq_128[1][2]);
+ sq[1][1] = SetrM128i(sq_128[1][3], sq_128[1][3]);
+ ma3[0][0] = SetrM128i(ma3_128[0][0], ma3_128[0][0]);
+ ma3[1][0] = SetrM128i(ma3_128[1][0], ma3_128[1][0]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0][0], b3_128[0][0]);
+ b3[0][1] = SetrM128i(b3_128[0][1], b3_128[0][1]);
+ b3[1][0] = SetrM128i(b3_128[1][0], b3_128[1][0]);
+ b3[1][1] = SetrM128i(b3_128[1][1], b3_128[1][1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[3][4], mat[3][3], b[3][3][2], bt[3][3][2], p[2][2], ma3x[2][3],
+ ma5x[3];
+ BoxFilterPreProcess(
+ src0 + x + 8, src1 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width, sq, ma3, b3,
+ ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444(ma3x[0], b3[0], x, &ma[1][2], &mat[1][2], &ma[2][1],
+ &mat[2][1], b[1][2], bt[1][2], b[2][1], bt[2][1], ma343[2],
+ ma444[1], b343[2], b444[1]);
+ Store343_444(ma3x[1], b3[1], x, &ma[2][2], &mat[2][2], b[2][2], bt[2][2],
+ ma343[3], ma444[2], b343[3], b444[2]);
+
+ ma[0][2] = Sum565Lo(ma5x);
+ ma[0][3] = Sum565Hi(ma5x);
+ ma[0][1] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x20);
+ ma[0][3] = _mm256_permute2x128_si256(ma[0][2], ma[0][3], 0x31);
+ StoreAligned32(ma565[1] + x + 0, ma[0][1]);
+ StoreAligned32(ma565[1] + x + 16, ma[0][3]);
+ Sum565(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0_lo = LoadUnaligned32(src + x);
+ const __m256i sr1_lo = LoadUnaligned32(src + stride + x);
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+    // Keeping the following 4 redundant lines is faster: there are not
+    // enough registers available, so spilling and reloading these values
+    // would be even slower.
+ ma[1][2] = LoadAligned32(ma343[2] + x); // Redundant line 1.
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ ma[2][1] = LoadAligned32(ma444[1] + x); // Redundant line 2.
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ClipAndStore(dst + x, d00);
+ const __m256i d10x = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+ ClipAndStore(dst + stride + x, d10x);
+
+ Sum565(b5 + 3, bt[0][1]);
+ StoreAligned64(b565[1] + x + 16, bt[0][1]);
+ const __m256i sr0_hi = LoadUnaligned32(src + x + 16);
+ const __m256i sr1_hi = LoadUnaligned32(src + stride + x + 16);
+ ma[0][2] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, bt[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0] + 2, bt[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][3], bt[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ mat[1][2] = LoadAligned32(ma343[2] + x + 16); // Redundant line 3.
+ LoadAligned64(b343[0] + x + 16, bt[1][0]);
+ LoadAligned64(b444[0] + x + 16, bt[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], bt[1]);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ mat[2][1] = LoadAligned32(ma444[1] + x + 16); // Redundant line 4.
+ LoadAligned64(b343[1] + x + 16, bt[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], bt[2]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 16, d01);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 16, d11);
+
+ sq[0][0] = sq[0][6];
+ sq[0][1] = sq[0][7];
+ sq[1][0] = sq[1][6];
+ sq[1][1] = sq[1][7];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][5];
+ b3[0][1] = b3[0][6];
+ b3[1][0] = b3[1][5];
+ b3[1][1] = b3[1][6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1_128 - sizeof(*src0) * width;
+ __m128i s[2], ma3_0, ma5_0, sq_128[4], b3_128[2], b5_128[2];
+ __m256i ma3[3], ma5[3], sq[8], b3[7], b5[7];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq_128);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, b3_128, b5_128);
+ sq[0] = SetrM128i(sq_128[2], sq_128[2]);
+ sq[1] = SetrM128i(sq_128[3], sq_128[3]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_128[0], b5_128[0]);
+ b5[1] = SetrM128i(b5_128[1], b5_128[1]);
+
+ int x = 0;
+ do {
+ __m256i ma[4], mat[4], b[3][2], bt[3][2], ma3x[3], ma5x[3], p[2];
+ BoxFilterPreProcessLastRow(
+ src0 + x + 8,
+ kOverreadInBytesPass1_256 + sizeof(*src0) * (x + 8 - width), sum_width,
+ x + 8, scales, sum3, sum5, square_sum3, square_sum5, sq, ma3, ma5, b3,
+ b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[2] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ mat[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 3, bt[1]);
+ ma[3] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 3, bt[2]);
+
+ const __m256i sr_lo = LoadUnaligned32(src + x);
+ ma[0] = LoadAligned32(ma565 + x);
+ ma[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x20);
+ mat[1] = _mm256_permute2x128_si256(ma[2], mat[1], 0x31);
+ LoadAligned64(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ ma[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x20);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ const __m256i sr_hi = LoadUnaligned32(src + x + 16);
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, bt[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, bt);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ mat[2] = _mm256_permute2x128_si256(ma[3], mat[2], 0x31);
+ LoadAligned64(b343 + x + 16, bt[0]);
+ LoadAligned64(b444 + x + 16, bt[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, bt);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 16, d1);
+
+ sq[0] = sq[6];
+ sq[1] = sq[7];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[5];
+ b3[1] = b3[6];
+ b5[0] = b5[5];
+ b5[1] = b5[6];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
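+  // |sum_width| and |sum_stride| include extra elements beyond the aligned
+  // width, presumably as headroom for the sliding-window loads near the row
+  // ends.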
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
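+  // Skipped only when |height| == 1; that single row is handled entirely by
+  // BoxFilterLastRow() below.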
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_10BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_AVX2() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc
new file mode 100644
index 0000000..96380e3
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_10bit_sse4.cc
@@ -0,0 +1,2530 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_sse4.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace {
+
+inline void WienerHorizontalClip(const __m128i s[2],
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (10 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit = (offset << 2) - 1;
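+  // For 10bpp with libgav1's usual constants (kWienerFilterBits == 7,
+  // kInterRoundBitsHorizontal == 3), offset == 8192 and limit == 32767, so
+  // the clipped values fit in int16_t.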
+ const __m128i offsets = _mm_set1_epi16(-offset);
+ const __m128i limits = _mm_set1_epi16(limit - offset);
+ const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsHorizontal - 1));
+ const __m128i sum0 = _mm_add_epi32(s[0], round);
+ const __m128i sum1 = _mm_add_epi32(s[1], round);
+ const __m128i rounded_sum0 = _mm_srai_epi32(sum0, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum1 = _mm_srai_epi32(sum1, kInterRoundBitsHorizontal);
+ const __m128i rounded_sum = _mm_packs_epi32(rounded_sum0, rounded_sum1);
+ const __m128i d0 = _mm_max_epi16(rounded_sum, offsets);
+ const __m128i d1 = _mm_min_epi16(d0, limits);
+ StoreAligned16(wiener_buffer, d1);
+}
+
+inline void WienerHorizontalTap7(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ __m128i filter[2];
+ filter[0] = _mm_shuffle_epi32(coefficients, 0x0);
+ filter[1] = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[7], madds[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ const __m128i s06 = _mm_add_epi16(s[0], s[6]);
+ const __m128i s15 = _mm_add_epi16(s[1], s[5]);
+ const __m128i s24 = _mm_add_epi16(s[2], s[4]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s06, s15);
+ const __m128i ss1 = _mm_unpackhi_epi16(s06, s15);
+ const __m128i ss2 = _mm_unpacklo_epi16(s24, s[3]);
+ const __m128i ss3 = _mm_unpackhi_epi16(s24, s[3]);
+ madds[0] = _mm_madd_epi16(ss0, filter[0]);
+ madds[1] = _mm_madd_epi16(ss1, filter[0]);
+ madds[2] = _mm_madd_epi16(ss2, filter[1]);
+ madds[3] = _mm_madd_epi16(ss3, filter[1]);
+ madds[0] = _mm_add_epi32(madds[0], madds[2]);
+ madds[1] = _mm_add_epi32(madds[1], madds[3]);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const __m128i filter =
+ _mm_shuffle_epi8(coefficients, _mm_set1_epi32(0x05040302));
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[5], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ const __m128i s04 = _mm_add_epi16(s[0], s[4]);
+ const __m128i s13 = _mm_add_epi16(s[1], s[3]);
+ const __m128i s2d = _mm_add_epi16(s[2], s[2]);
+ const __m128i s0m = _mm_sub_epi16(s04, s2d);
+ const __m128i s1m = _mm_sub_epi16(s13, s2d);
+ const __m128i ss0 = _mm_unpacklo_epi16(s0m, s1m);
+ const __m128i ss1 = _mm_unpackhi_epi16(s0m, s1m);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ const __m128i s2_lo = _mm_unpacklo_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2_hi = _mm_unpackhi_epi16(s[2], _mm_setzero_si128());
+ const __m128i s2x128_lo = _mm_slli_epi32(s2_lo, 7);
+ const __m128i s2x128_hi = _mm_slli_epi32(s2_hi, 7);
+ madds[0] = _mm_add_epi32(madds[0], s2x128_lo);
+ madds[1] = _mm_add_epi32(madds[1], s2x128_hi);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m128i coefficients,
+ int16_t** const wiener_buffer) {
+ const auto filter = _mm_shuffle_epi32(coefficients, 0x55);
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i s[3], madds[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ const __m128i s02 = _mm_add_epi16(s[0], s[2]);
+ const __m128i ss0 = _mm_unpacklo_epi16(s02, s[1]);
+ const __m128i ss1 = _mm_unpackhi_epi16(s02, s[1]);
+ madds[0] = _mm_madd_epi16(ss0, filter);
+ madds[1] = _mm_madd_epi16(ss1, filter);
+ WienerHorizontalClip(madds, *wiener_buffer + x);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap1(const uint16_t* src,
+ const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m128i s = LoadUnaligned16(src + x);
+ const __m128i d = _mm_slli_epi16(s, 4);
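+      // Only the 128 (1 << kWienerFilterBits) center tap is nonzero, so the
+      // filter plus the >> kInterRoundBitsHorizontal rounding reduces to a
+      // plain << 4 here.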
+ StoreAligned16(*wiener_buffer + x, d);
+ x += 8;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m128i WienerVertical7(const __m128i a[4], const __m128i filter[4]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd3 = _mm_madd_epi16(a[3], filter[3]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i madd23 = _mm_add_epi32(madd2, madd3);
+ const __m128i sum = _mm_add_epi32(madd01, madd23);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical5(const __m128i a[3], const __m128i filter[3]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i madd2 = _mm_madd_epi16(a[2], filter[2]);
+ const __m128i madd01 = _mm_add_epi32(madd0, madd1);
+ const __m128i sum = _mm_add_epi32(madd01, madd2);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVertical3(const __m128i a[2], const __m128i filter[2]) {
+ const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
+ const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
+ const __m128i sum = _mm_add_epi32(madd0, madd1);
+ return _mm_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m128i WienerVerticalClip(const __m128i s[2]) {
+ const __m128i d = _mm_packus_epi32(s[0], s[1]);
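+  // Clamp to the 10bpp maximum pixel value, (1 << 10) - 1 == 1023.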
+ return _mm_min_epu16(d, _mm_set1_epi16(1023));
+}
+
+inline __m128i WienerVerticalFilter7(const __m128i a[7],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[4], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], a[5]);
+ b[3] = _mm_unpacklo_epi16(a[6], round);
+ c[0] = WienerVertical7(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], a[5]);
+ b[3] = _mm_unpackhi_epi16(a[6], round);
+ c[1] = WienerVertical7(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter5(const __m128i a[5],
+ const __m128i filter[3]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[3], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], a[3]);
+ b[2] = _mm_unpacklo_epi16(a[4], round);
+ c[0] = WienerVertical5(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], a[3]);
+ b[2] = _mm_unpackhi_epi16(a[4], round);
+ c[1] = WienerVertical5(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalFilter3(const __m128i a[3],
+ const __m128i filter[2]) {
+ const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m128i b[2], c[2];
+ b[0] = _mm_unpacklo_epi16(a[0], a[1]);
+ b[1] = _mm_unpacklo_epi16(a[2], round);
+ c[0] = WienerVertical3(b, filter);
+ b[0] = _mm_unpackhi_epi16(a[0], a[1]);
+ b[1] = _mm_unpackhi_epi16(a[2], round);
+ c[1] = WienerVertical3(b, filter);
+ return WienerVerticalClip(c);
+}
+
+inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[7]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[3], __m128i a[5]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m128i filter[2], __m128i a[3]) {
+ a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[4];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi32(c, 0x55);
+ filter[2] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[3] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
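+  // The high 16-bit lane of each pair in filter[3] is 1, so _mm_madd_epi16()
+  // adds the rounding constant that WienerVerticalFilter7() interleaves into
+  // b[3].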
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[8], d[2];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ a[7] = LoadAligned16(wiener_buffer + x + 7 * width);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[7];
+ const __m128i d =
+ WienerVerticalTap7Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i c = LoadLo8(coefficients);
+ __m128i filter[3];
+ filter[0] = _mm_shuffle_epi32(c, 0x0);
+ filter[1] = _mm_shuffle_epi8(c, _mm_set1_epi32(0x03020504));
+ filter[2] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[6], d[2];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ a[5] = LoadAligned16(wiener_buffer + x + 5 * width);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[5];
+ const __m128i d =
+ WienerVerticalTap5Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ __m128i filter[2];
+ filter[0] = _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ filter[1] =
+ _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[0]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[4], d[2];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ a[3] = LoadAligned16(wiener_buffer + x + 3 * width);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+ StoreAligned16(dst + x, d[0]);
+ StoreAligned16(dst + dst_stride + x, d[1]);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m128i a[3];
+ const __m128i d =
+ WienerVerticalTap3Kernel(wiener_buffer + x, width, filter, a);
+ StoreAligned16(dst + x, d);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint16_t* const dst) {
+ const __m128i a = LoadAligned16(wiener_buffer);
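+  // With only the 128 center tap, the vertical filter plus its
+  // >> kInterRoundBitsVertical rounding reduces to a rounded >> 4; the result
+  // is clamped to the 10bpp range [0, 1023].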
+ const __m128i b = _mm_add_epi16(a, _mm_set1_epi16(8));
+ const __m128i c = _mm_srai_epi16(b, 4);
+ const __m128i d = _mm_max_epi16(c, _mm_setzero_si128());
+ const __m128i e = _mm_min_epi16(d, _mm_set1_epi16(1023));
+ StoreAligned16(dst, e);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint16_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 8;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 8;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 16);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+  // Horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ const __m128i coefficients_horizontal =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+  // Vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint16_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+// SIMD overreads 8 - (width % 8) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 4;
+constexpr int kOverreadInBytesPass2 = 8;
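+// For 16-bit pixels the worst cases in bytes are (8 - 2 * 3) * 2 == 4 for
+// Pass 1 and (8 - 2 * 2) * 2 == 8 for Pass 2, hence the constants above.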
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code, and the whole
+// decoder can be as much as 15% slower.
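+// The Vaddl/Vaddw/Vmull helpers below follow the naming and semantics of the
+// NEON vaddl/vaddw/vmull widening instructions.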
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m128i VmullNLo8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullNHi8(const __m128i src0, const int src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m128i VrshrU16(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi16(src0, _mm_set1_epi16(1 << (src1 - 1)));
+ return _mm_srli_epi16(sum, src1);
+}
+
+inline __m128i VrshrS32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline void Square(const __m128i src, __m128i dst[2]) {
+ const __m128i s0 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src, _mm_setzero_si128());
+ dst[0] = _mm_madd_epi16(s0, s0);
+ dst[1] = _mm_madd_epi16(s1, s1);
+}
+
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_32(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 4);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_32(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_32(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 12);
+ dst[4] = src[1];
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m128i Sum3_32(const __m128i src[3]) {
+ return Sum3_32(src[0], src[1], src[2]);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
+ const __m128i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m128i Sum5_32(const __m128i src[5]) {
+ return Sum5_32(&src[0], &src[1], &src[2], &src[3], &src[4]);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum3Horizontal16(const __m128i src[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ return Sum3_16(s);
+}
+
+inline void Sum3Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum3_32(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum3_32(s);
+}
+
+inline __m128i Sum5Horizontal16(const __m128i src[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ return Sum5_16(s);
+}
+
+inline void Sum5Horizontal32(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ dst[0] = Sum5_32(s);
+ Prepare5_32(src + 1, s);
+ dst[1] = Sum5_32(s);
+}
+
+void SumHorizontal16(const __m128i src[2], __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum04 = _mm_add_epi16(s[0], s[4]);
+ *row3 = Sum3_16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal16(const __m128i src[3], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ SumHorizontal16(src + 0, row3_0, row5_0);
+ SumHorizontal16(src + 1, row3_1, row5_1);
+}
+
+void SumHorizontal32(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = _mm_add_epi32(src[0], src[4]);
+ *row_sq3 = Sum3_32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+inline void SumHorizontal32(const __m128i src[3], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_32(src + 0, s);
+ SumHorizontal32(s, row_sq3_0, row_sq5_0);
+ Prepare5_32(src + 1, s);
+ SumHorizontal32(s, row_sq3_1, row_sq5_1);
+}
+
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return _mm_add_epi32(sum3, src[1]);
+}
+
+inline void Sum343(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum343(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum343(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565(const __m128i src[3]) {
+ const __m128i sum = Sum3_32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return _mm_add_epi32(sum5, src[1]);
+}
+
+inline void Sum565(const __m128i src[3], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_32(src + 0, s);
+ dst[0] = Sum565(s);
+ Prepare3_32(src + 1, s);
+ dst[1] = Sum565(s);
+}
+
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ SumHorizontal16(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal32(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal32(sq + 2, &row_sq3[0], &row_sq3[1], &row_sq5[0],
+ &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint16_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ const ptrdiff_t overread_in_bytes =
+ ((size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2) -
+ sizeof(*src) * width;
+ int y = 2;
+ do {
+ __m128i s[3], sq[6];
+ s[0] = LoadUnaligned16Msan(src, overread_in_bytes);
+ Square(s[0], sq);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ s[1] = LoadUnaligned16Msan(
+ src + 8, overread_in_bytes + sizeof(*src) * (sum_width - x + 8));
+ x -= 16;
+ src += 16;
+ s[2] = LoadUnaligned16Msan(
+ src, overread_in_bytes + sizeof(*src) * (sum_width - x));
+ Square(s[1], sq + 2);
+ Square(s[2], sq + 4);
+ if (size == 3) {
+ row[0] = Sum3Horizontal16(s + 0);
+ row[1] = Sum3Horizontal16(s + 1);
+ Sum3Horizontal32(sq + 0, row_sq + 0);
+ Sum3Horizontal32(sq + 2, row_sq + 2);
+ } else {
+ row[0] = Sum5Horizontal16(s + 0);
+ row[1] = Sum5Horizontal16(s + 1);
+ Sum5Horizontal32(sq + 0, row_sq + 0);
+ Sum5Horizontal32(sq + 2, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[2];
+ sq[0] = sq[4];
+ sq[1] = sq[5];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
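+  // For example, n == 9 gives sum_sq + (sum_sq << 3); for n == 25 the extra
+  // (sum_sq << 4) term completes the multiply, since 25 == 1 + 8 + 16.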
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i b = VrshrU16(sum, 2);
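+  // The 10bpp box sums are rounded down by 2 bits (and the squared sums by 4
+  // bits below), presumably to bring them back into the 8bpp range assumed by
+  // the rest of the computation and by kSgrMaLookup.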
+ const __m128i sum_lo = _mm_unpacklo_epi16(b, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(b, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, VrshrU32(sum_sq[0], 4), scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, VrshrU32(sum_sq[1], 4), scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+inline void CalculateB5(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
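+  // one_over_n_quarter (41) fits in the signed byte operand of
+  // _mm_maddubs_epi16(); the full 164 would not. The two bits dropped here
+  // are restored by shifting two fewer bits below.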
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ b[0] = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ b[1] = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+}
+
+inline void CalculateB3(const __m128i sum, const __m128i ma, __m128i b[2]) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
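+  // That is, ((1 << 12) + 4) / 9 == 455.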
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ b[0] = VrshrU32(m2, kSgrProjReciprocalBits);
+ b[1] = VrshrU32(m3, kSgrProjReciprocalBits);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The value is not actually stored to memory and reloaded; the compiler
+  // keeps it in a 64-bit general-purpose register, which is faster than
+  // using _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ if (n == 9) {
+ CalculateB3(sum, maq, b);
+ } else {
+ CalculateB5(sum, maq, b);
+ }
+}
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
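+// For example, for index 16 the comparison yields 0xFF; after the OR the high
+// bit of the mask byte is set, and _mm_shuffle_epi8() writes 0 for that byte.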
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
+}
+
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
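+  // |offset| is -1 (0xFF) in lanes where index > threshold, so the add
+  // decrements |value| in exactly those lanes.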
+ return _mm_add_epi8(value, offset);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i b0[2],
+ __m128i b1[2]) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+ // For elements whose indices are larger than 47, since they seldom change
+ // values with the increase of the index, we use comparison and arithmetic
+ // operations to calculate their values.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+  *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+  *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+  *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+  *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+  *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
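+  // For example, an index of 60 starts as 5 and is decremented once
+  // (60 > 55), producing the expected value 4.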
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
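+  // E.g. ((1 << 12) + 12) / 25 = 164 and ((1 << 12) + 4) / 9 = 455.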
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[0], maq0, b0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ CalculateB3(sum[1], maq1, b1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[4]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, b + 0, b + 2);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: replacing the slow LookupIntermediate() with CalculateIntermediate()
+// when calculating 16 intermediate data points has been tried, but the
+// compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i b[2]) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[3], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_32(b3 + 0, b);
+ sum_b111[0] = Sum3_32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[0] = _mm_add_epi32(sum_b343[0], b[1]);
+ Prepare3_32(b3 + 1, b);
+ sum_b111[1] = Sum3_32(b);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[1] = _mm_add_epi32(sum_b343[1], b[1]);
+ StoreAligned32U32(b444 + x, sum_b444);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[3],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][4], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ s5[0][3] = Sum5Horizontal16(s[0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal16(s[1]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5Horizontal32(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5Horizontal32(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ s5[0][3] = Sum5Horizontal16(s[0] + 1);
+ s5[1][3] = Sum5Horizontal16(s[0] + 2);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ s5[0][4] = Sum5Horizontal16(s[1] + 1);
+ s5[1][4] = Sum5Horizontal16(s[1] + 2);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5Horizontal32(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5Horizontal32(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ Sum5Horizontal32(sq[0] + 4, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5Horizontal32(sq[1] + 4, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s[2], const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s5[5], sq5[5][2];
+ Square(s[1], sq + 2);
+ s5[3] = s5[4] = Sum5Horizontal16(s);
+ Sum5Horizontal32(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s5[2][5], sq5[5][2];
+ Square(s[2], sq + 4);
+ s5[0][3] = Sum5Horizontal16(s + 1);
+ s5[1][3] = Sum5Horizontal16(s + 2);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5Horizontal32(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], b + 2);
+
+ Square(s[3], sq + 6);
+ Sum5Horizontal32(sq + 4, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], b + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s[2], const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i* const ma,
+ __m128i b[2]) {
+ __m128i s3[3], sq3[3][2];
+ Square(s[1], sq + 2);
+ s3[2] = Sum3Horizontal16(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3Horizontal32(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[4], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[8], __m128i ma[2],
+ __m128i b[6]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ s3[2] = Sum3Horizontal16(s + 1);
+ s3[3] = Sum3Horizontal16(s + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3Horizontal32(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ Sum3Horizontal32(sq + 4, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 2);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][4], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i* const ma5, __m128i b5[2]) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ Square(s[0][1], sq[0] + 2);
+ Square(s[1][1], sq[1] + 2);
+ SumHorizontal16(s[0], &s3[2], &s5[3]);
+ SumHorizontal16(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal32(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal32(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], b3[0], b3[1]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][4], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][8], __m128i ma3[2][2],
+ __m128i b3[2][6], __m128i ma5[2], __m128i b5[6]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal16(s[0] + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal16(s[1] + 1, &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Square(s[0][2], sq[0] + 4);
+ Square(s[1][2], sq[1] + 4);
+ SumHorizontal32(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal32(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], b5 + 2);
+
+ Square(s[0][3], sq[0] + 6);
+ Square(s[1][3], sq[1] + 6);
+ SumHorizontal32(sq[0] + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal32(sq[1] + 4, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 2);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 2);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], b5 + 4);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s[2], const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i* const ma3,
+ __m128i* const ma5, __m128i b3[2], __m128i b5[2]) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ Square(s[1], sq + 2);
+ SumHorizontal16(s, &s3[2], &s5[3]);
+ SumHorizontal32(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[4], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[8], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[6], __m128i b5[6]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ Square(s[2], sq + 4);
+ SumHorizontal16(s + 1, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal32(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 2);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ Square(s[3], sq + 6);
+ SumHorizontal32(sq + 4, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 4);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 2);
+}
+
+inline void BoxSumFilterPreProcess5(const uint16_t* const src0,
+ const uint16_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565(bs + 0, b + 0);
+ Sum565(bs + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint16_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src + x + 16,
+ overread_in_bytes + sizeof(*src) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src + x + 24,
+ overread_in_bytes + sizeof(*src) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 2, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343(bs + 0, b + 0);
+ Sum343(bs + 2, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint16_t* const src0, const uint16_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343(b3[0] + 0, b + 0);
+ Sum343(b3[0] + 2, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565(b5 + 0, b + 0);
+ Sum565(b5 + 2, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 2, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m128i v = _mm_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
+ const __m128i b[2]) {
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
+ const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
+ __m128i b_sum[2];
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
+ __m128i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
+ const __m128i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m128i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
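+  // The shift above is kSgrProjRestoreBits + kSgrProjPrecisionBits =
+  // 4 + 7 = 11.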
+ const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
+ return _mm_add_epi16(src, vv);
+}
+
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
+ __m128i v[2];
+ const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
+ const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
+ const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
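+  // Each 32-bit lane of |w0_w2| holds w0 in its low half and w2 in its high
+  // half, so _mm_madd_epi16() computes w0 * filter[0] + w2 * filter[1] per
+  // lane.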
+ v[0] = _mm_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m128i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+inline void ClipAndStore(uint16_t* const dst, const __m128i val) {
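+  // Clamp to the 10-bit pixel range [0, 1023].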
+ const __m128i val0 = _mm_max_epi16(val, _mm_setzero_si128());
+ const __m128i val1 = _mm_min_epi16(val0, _mm_set1_epi16(1023));
+ StoreAligned16(dst, val1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], mas[2], sq[2][8], bs[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2], p[2];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ const __m128i sr0_lo = LoadAligned16(src + x + 0);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x + 0);
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
+
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565(bs + 2, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess5LastRowLo(s, scale, sum5, square_sum5, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ __m128i ma[2], ma5[3], b[2][2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
+
+ ma[1] = Sum565Hi(ma5);
+ Sum565(bs + 2, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass2 - sizeof(*src0) * width;
+ __m128i s[4], mas[2], sq[8], bs[6];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq, &mas[0], bs);
+
+ int x = 0;
+ do {
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 2, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ mas[0] = mas[1];
+ bs[0] = bs[4];
+ bs[1] = bs[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint16_t* const src, const uint16_t* const src0,
+ const uint16_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[2][4], ma3[2][2], ma5[2], sq[2][8], b3[2][6], b5[6];
+ s[0][0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[0][1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ s[1][0] = LoadUnaligned16Msan(src1 + 0, overread_in_bytes + 0);
+ s[1][1] = LoadUnaligned16Msan(src1 + 8, overread_in_bytes + 16);
+ Square(s[0][0], sq[0]);
+ Square(s[1][0], sq[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], b5);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[0][3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ s[1][2] = LoadUnaligned16Msan(src1 + x + 16,
+ overread_in_bytes + sizeof(*src1) * (x + 16));
+ s[1][3] = LoadUnaligned16Msan(src1 + x + 24,
+ overread_in_bytes + sizeof(*src1) * (x + 24));
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0_lo = LoadAligned16(src + x);
+ const __m128i sr1_lo = LoadAligned16(src + stride + x);
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 2, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 2, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565(b5 + 2, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = LoadAligned16(src + x + 8);
+ const __m128i sr1_hi = LoadAligned16(src + stride + x + 8);
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ ClipAndStore(dst + x + 0, d00);
+ ClipAndStore(dst + x + 8, d01);
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ ClipAndStore(dst + stride + x + 0, d10);
+ ClipAndStore(dst + stride + x + 8, d11);
+ s[0][0] = s[0][2];
+ s[0][1] = s[0][3];
+ s[1][0] = s[1][2];
+ s[1][1] = s[1][3];
+ sq[0][2] = sq[0][6];
+ sq[0][3] = sq[0][7];
+ sq[1][2] = sq[1][6];
+ sq[1][3] = sq[1][7];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][4];
+ b3[0][1] = b3[0][5];
+ b3[1][0] = b3[1][4];
+ b3[1][1] = b3[1][5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint16_t* const src, const uint16_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint16_t* const dst) {
+ const ptrdiff_t overread_in_bytes =
+ kOverreadInBytesPass1 - sizeof(*src0) * width;
+ __m128i s[4], ma3[2], ma5[2], sq[8], b3[6], b5[6], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0 + 0, overread_in_bytes + 0);
+ s[1] = LoadUnaligned16Msan(src0 + 8, overread_in_bytes + 16);
+ Square(s[0], sq);
+ BoxFilterPreProcessLastRowLo(s, scales, sum3, sum5, square_sum3, square_sum5,
+ sq, &ma3[0], &ma5[0], b3, b5);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[2] = LoadUnaligned16Msan(src0 + x + 16,
+ overread_in_bytes + sizeof(*src0) * (x + 16));
+ s[3] = LoadUnaligned16Msan(src0 + x + 24,
+ overread_in_bytes + sizeof(*src0) * (x + 24));
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343(b3, b[2]);
+ const __m128i sr_lo = LoadAligned16(src + x + 0);
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565(b5 + 2, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343(b3 + 2, b[2]);
+ const __m128i sr_hi = LoadAligned16(src + x + 8);
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ ClipAndStore(dst + x + 0, d0);
+ ClipAndStore(dst + x + 8, d1);
+ s[1] = s[3];
+ sq[2] = sq[6];
+ sq[3] = sq[7];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[4];
+ b3[1] = b3[5];
+ b5[0] = b5[4];
+ b5[1] = b5[5];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint16_t* src,
+ const ptrdiff_t stride, const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
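+  // The main loop below filters two rows per iteration, rotating the row-sum
+  // and intermediate-result pointers by two so previously computed sums are
+  // reused.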
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint16_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint16_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint16_t* src, const ptrdiff_t stride,
+ const uint16_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint16_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint16_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint16_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 16, up to 15 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint16_t*>(source);
+ const auto* const top = static_cast<const uint16_t*>(top_border);
+ const auto* const bottom = static_cast<const uint16_t*>(bottom_border);
+ auto* const dst = static_cast<uint16_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
+#endif
+}
+
+} // namespace
+
+void LoopRestorationInit10bpp_SSE4_1() { Init10bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !(LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10)
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit10bpp_SSE4_1() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_SSE4_1 && LIBGAV1_MAX_BITDEPTH >= 10
diff --git a/libgav1/src/dsp/x86/loop_restoration_avx2.cc b/libgav1/src/dsp/x86/loop_restoration_avx2.cc
new file mode 100644
index 0000000..351a324
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_avx2.cc
@@ -0,0 +1,2941 @@
+// Copyright 2020 The libgav1 Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "src/dsp/loop_restoration.h"
+#include "src/utils/cpu.h"
+
+#if LIBGAV1_TARGETING_AVX2
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "src/dsp/common.h"
+#include "src/dsp/constants.h"
+#include "src/dsp/dsp.h"
+#include "src/dsp/x86/common_avx2.h"
+#include "src/utils/common.h"
+#include "src/utils/constants.h"
+
+namespace libgav1 {
+namespace dsp {
+namespace low_bitdepth {
+namespace {
+
+inline void WienerHorizontalClip(const __m256i s[2], const __m256i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
+ 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
+ const __m256i offsets = _mm256_set1_epi16(-offset);
+ const __m256i limits = _mm256_set1_epi16(limit - offset);
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
+ // The sum range here is [-128 * 255, 90 * 255].
+ const __m256i madd = _mm256_add_epi16(s[0], s[1]);
+ const __m256i sum = _mm256_add_epi16(madd, round);
+ const __m256i rounded_sum0 =
+ _mm256_srai_epi16(sum, kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
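+  // |s_3x128| is src * 128 pre-scaled by the rounding shift; it compensates
+  // for the center tap being stored minus 128 so that it fits the signed-byte
+  // operand of _mm256_maddubs_epi16().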
+ const __m256i rounded_sum1 = _mm256_add_epi16(rounded_sum0, s_3x128);
+ const __m256i d0 = _mm256_max_epi16(rounded_sum1, offsets);
+ const __m256i d1 = _mm256_min_epi16(d0, limits);
+ StoreAligned32(wiener_buffer, d1);
+}
+
+// Using _mm256_alignr_epi8() is about 8% faster than loading everything and
+// unpacking, because the compiler generates redundant code for the
+// load-and-unpack variant.
+inline void WienerHorizontalTap7Kernel(const __m256i s[2],
+ const __m256i filter[4],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ const auto s67 = _mm256_alignr_epi8(s[1], s[0], 13);
+ __m256i madds[4];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[3] = _mm256_maddubs_epi16(s67, filter[3]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm256_add_epi16(madds[1], madds[3]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s23, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m256i s[2],
+ const __m256i filter[3],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ const auto s45 = _mm256_alignr_epi8(s[1], s[0], 9);
+ __m256i madds[3];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ madds[2] = _mm256_maddubs_epi16(s45, filter[2]);
+ madds[0] = _mm256_add_epi16(madds[0], madds[2]);
+ const __m256i s_3x128 = _mm256_srli_epi16(_mm256_slli_epi16(s23, 8),
+ kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap3Kernel(const __m256i s[2],
+ const __m256i filter[2],
+ int16_t* const wiener_buffer) {
+ const auto s01 = _mm256_alignr_epi8(s[1], s[0], 1);
+ const auto s23 = _mm256_alignr_epi8(s[1], s[0], 5);
+ __m256i madds[2];
+ madds[0] = _mm256_maddubs_epi16(s01, filter[0]);
+ madds[1] = _mm256_maddubs_epi16(s23, filter[1]);
+ const __m256i s_3x128 = _mm256_slli_epi16(_mm256_srli_epi16(s01, 8),
+ 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[4];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0100));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[2] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0102));
+ filter[3] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8000)));
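+  // Applying these tap pairs to the four staggered source vectors realizes
+  // the symmetric 7-tap kernel
+  //   c0 * (x0 + x6) + c1 * (x1 + x5) + c2 * (x2 + x4) + c3 * x3,
+  // where c3 is the biased center tap.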
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[3];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0201));
+ filter[1] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0203));
+ filter[2] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8001)));
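+  // These tap pairs realize the symmetric 5-tap kernel
+  //   c1 * (x0 + x4) + c2 * (x1 + x3) + c3 * x2.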
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ const __m256i coefficients,
+ int16_t** const wiener_buffer) {
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi8(coefficients, _mm256_set1_epi16(0x0302));
+ filter[1] = _mm256_shuffle_epi8(
+ coefficients, _mm256_set1_epi16(static_cast<int16_t>(0x8002)));
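+  // These tap pairs realize the symmetric 3-tap kernel
+  //   c2 * (x0 + x2) + c3 * x1.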
+ for (int y = height; y != 0; --y) {
+ __m256i s = LoadUnaligned32(src);
+ __m256i ss[4];
+ ss[0] = _mm256_unpacklo_epi8(s, s);
+ ptrdiff_t x = 0;
+ do {
+ ss[1] = _mm256_unpackhi_epi8(s, s);
+ s = LoadUnaligned32(src + x + 32);
+ ss[3] = _mm256_unpacklo_epi8(s, s);
+ ss[2] = _mm256_permute2x128_si256(ss[0], ss[3], 0x21);
+ WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
+ WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 16);
+ ss[0] = ss[3];
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
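+// When only the center tap (128) is non-zero, the horizontal filter reduces
+// to s * 128 >> kInterRoundBitsHorizontal, i.e. s << 4, so no rounding or
+// clipping is needed.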
+inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const int height,
+ int16_t** const wiener_buffer) {
+ for (int y = height; y != 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ const __m256i s = LoadUnaligned32(src + x);
+ const __m256i s0 = _mm256_unpacklo_epi8(s, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(s, _mm256_setzero_si256());
+ __m256i d[2];
+ d[0] = _mm256_slli_epi16(s0, 4);
+ d[1] = _mm256_slli_epi16(s1, 4);
+ StoreAligned64(*wiener_buffer + x, d);
+ x += 32;
+ } while (x < width);
+ src += src_stride;
+ *wiener_buffer += width;
+ }
+}
+
+inline __m256i WienerVertical7(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum0 = _mm256_add_epi32(round, madd0);
+ const __m256i sum1 = _mm256_add_epi32(sum0, madd1);
+ return _mm256_srai_epi32(sum1, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical5(const __m256i a[2], const __m256i filter[2]) {
+ const __m256i madd0 = _mm256_madd_epi16(a[0], filter[0]);
+ const __m256i madd1 = _mm256_madd_epi16(a[1], filter[1]);
+ const __m256i sum = _mm256_add_epi32(madd0, madd1);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVertical3(const __m256i a, const __m256i filter) {
+ const __m256i round = _mm256_set1_epi32(1 << (kInterRoundBitsVertical - 1));
+ const __m256i madd = _mm256_madd_epi16(a, filter);
+ const __m256i sum = _mm256_add_epi32(round, madd);
+ return _mm256_srai_epi32(sum, kInterRoundBitsVertical);
+}
+
+inline __m256i WienerVerticalFilter7(const __m256i a[7],
+ const __m256i filter[2]) {
+ __m256i b[2];
+ const __m256i a06 = _mm256_add_epi16(a[0], a[6]);
+ const __m256i a15 = _mm256_add_epi16(a[1], a[5]);
+ const __m256i a24 = _mm256_add_epi16(a[2], a[4]);
+ b[0] = _mm256_unpacklo_epi16(a06, a15);
+ b[1] = _mm256_unpacklo_epi16(a24, a[3]);
+ const __m256i sum0 = WienerVertical7(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a06, a15);
+ b[1] = _mm256_unpackhi_epi16(a24, a[3]);
+ const __m256i sum1 = WienerVertical7(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter5(const __m256i a[5],
+ const __m256i filter[2]) {
+ const __m256i round = _mm256_set1_epi16(1 << (kInterRoundBitsVertical - 1));
+ __m256i b[2];
+ const __m256i a04 = _mm256_add_epi16(a[0], a[4]);
+ const __m256i a13 = _mm256_add_epi16(a[1], a[3]);
+ b[0] = _mm256_unpacklo_epi16(a04, a13);
+ b[1] = _mm256_unpacklo_epi16(a[2], round);
+ const __m256i sum0 = WienerVertical5(b, filter);
+ b[0] = _mm256_unpackhi_epi16(a04, a13);
+ b[1] = _mm256_unpackhi_epi16(a[2], round);
+ const __m256i sum1 = WienerVertical5(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalFilter3(const __m256i a[3], const __m256i filter) {
+ __m256i b;
+ const __m256i a02 = _mm256_add_epi16(a[0], a[2]);
+ b = _mm256_unpacklo_epi16(a02, a[1]);
+ const __m256i sum0 = WienerVertical3(b, filter);
+ b = _mm256_unpackhi_epi16(a02, a[1]);
+ const __m256i sum1 = WienerVertical3(b, filter);
+ return _mm256_packs_epi32(sum0, sum1);
+}
+
+inline __m256i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[7]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ a[6] = LoadAligned32(wiener_buffer + 6 * wiener_stride);
+ return WienerVerticalFilter7(a, filter);
+}
+
+inline __m256i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i a[5]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ a[4] = LoadAligned32(wiener_buffer + 4 * wiener_stride);
+ return WienerVerticalFilter5(a, filter);
+}
+
+inline __m256i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i a[3]) {
+ a[0] = LoadAligned32(wiener_buffer + 0 * wiener_stride);
+ a[1] = LoadAligned32(wiener_buffer + 1 * wiener_stride);
+ a[2] = LoadAligned32(wiener_buffer + 2 * wiener_stride);
+ return WienerVerticalFilter3(a, filter);
+}
+
+inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[8];
+ d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[7] = LoadAligned32(wiener_buffer + 7 * wiener_stride);
+ d[1] = WienerVerticalFilter7(a + 1, filter);
+}
+
+inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter[2], __m256i d[2]) {
+ __m256i a[6];
+ d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[5] = LoadAligned32(wiener_buffer + 5 * wiener_stride);
+ d[1] = WienerVerticalFilter5(a + 1, filter);
+}
+
+inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
+ const ptrdiff_t wiener_stride,
+ const __m256i filter, __m256i d[2]) {
+ __m256i a[4];
+ d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
+ a[3] = LoadAligned32(wiener_buffer + 3 * wiener_stride);
+ d[1] = WienerVerticalFilter3(a + 1, filter);
+}
+
+inline void WienerVerticalTap7(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[4], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastq_epi64(LoadLo8(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0x0);
+ filter[1] = _mm256_shuffle_epi32(c, 0x55);
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap7Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[7];
+ const __m256i d0 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap7Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap5(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[3], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i c = _mm256_broadcastd_epi32(Load4(coefficients));
+ __m256i filter[2];
+ filter[0] = _mm256_shuffle_epi32(c, 0);
+ filter[1] =
+ _mm256_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap5Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[5];
+ const __m256i d0 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap5Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
+inline void WienerVerticalTap3(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ const int16_t coefficients[2], uint8_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m256i filter =
+ _mm256_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i d[2][2];
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
+ WienerVerticalTap3Kernel2(wiener_buffer + x + 16, width, filter, d[1]);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d[0][0], d[1][0]));
+ StoreUnaligned32(dst + dst_stride + x,
+ _mm256_packus_epi16(d[0][1], d[1][1]));
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ __m256i a[3];
+ const __m256i d0 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
+ const __m256i d1 =
+ WienerVerticalTap3Kernel(wiener_buffer + x + 16, width, filter, a);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ x += 32;
+ } while (x < width);
+ }
+}
+
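+// With only the center tap (128), the vertical filter reduces to
+// (x * 128 + (1 << 10)) >> kInterRoundBitsVertical, i.e. (x + 8) >> 4, which
+// undoes the << 4 applied by WienerHorizontalTap1().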
+inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
+ uint8_t* const dst) {
+ const __m256i a0 = LoadAligned32(wiener_buffer + 0);
+ const __m256i a1 = LoadAligned32(wiener_buffer + 16);
+ const __m256i b0 = _mm256_add_epi16(a0, _mm256_set1_epi16(8));
+ const __m256i b1 = _mm256_add_epi16(a1, _mm256_set1_epi16(8));
+ const __m256i c0 = _mm256_srai_epi16(b0, 4);
+ const __m256i c1 = _mm256_srai_epi16(b1, 4);
+ const __m256i d = _mm256_packus_epi16(c0, c1);
+ StoreUnaligned32(dst, d);
+}
+
+inline void WienerVerticalTap1(const int16_t* wiener_buffer,
+ const ptrdiff_t width, const int height,
+ uint8_t* dst, const ptrdiff_t dst_stride) {
+ for (int y = height >> 1; y > 0; --y) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
+ x += 32;
+ } while (x < width);
+ dst += 2 * dst_stride;
+ wiener_buffer += 2 * width;
+ }
+
+ if ((height & 1) != 0) {
+ ptrdiff_t x = 0;
+ do {
+ WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
+ x += 32;
+ } while (x < width);
+ }
+}
+
+void WienerFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int16_t* const number_leading_zero_coefficients =
+ restoration_info.wiener_info.number_leading_zero_coefficients;
+ const int number_rows_to_skip = std::max(
+ static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
+ 1);
+ const ptrdiff_t wiener_stride = Align(width, 32);
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
+ // The values are saturated to 13 bits before storing.
+ int16_t* wiener_buffer_horizontal =
+ wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
+
+ // horizontal filtering.
+ // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
+ const int height_horizontal =
+ height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const __m128i c =
+ LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ // In order to keep the horizontal pass intermediate values within 16 bits we
+ // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
+ __m128i c_horizontal =
+ _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
+ c_horizontal = _mm_packs_epi16(c_horizontal, c_horizontal);
+ const __m256i coefficients_horizontal = _mm256_broadcastd_epi32(c_horizontal);
+ if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
+ // The maximum over-reads happen here.
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ coefficients_horizontal, &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
+ }
+
+ // vertical filtering.
+ // Over-writes up to 15 values.
+ const int16_t* const filter_vertical =
+ restoration_info.wiener_info.filter[WienerInfo::kVertical];
+ auto* dst = static_cast<uint8_t*>(dest);
+ if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
+    // Because the top row of |source| is a duplicate of the second row, and
+    // the bottom row of |source| is a duplicate of the row above it, we can
+    // duplicate the top and bottom rows of |wiener_buffer| accordingly.
+ memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
+ sizeof(*wiener_buffer_horizontal) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
+ WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
+ filter_vertical, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
+ WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
+ height, filter_vertical + 1, dst, stride);
+ } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
+ WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
+ wiener_stride, height, filter_vertical + 2, dst, stride);
+ } else {
+ assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
+ WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
+ wiener_stride, height, dst, stride);
+ }
+}
+
+//------------------------------------------------------------------------------
+// SGR
+
+constexpr int kSumOffset = 24;
+
+// SIMD overreads (number of bytes in a SIMD register) - (width % 16) -
+// 2 * padding pixels, where padding is 3 for Pass 1 and 2 for Pass 2. A SIMD
+// register holds 16 bytes for SSE4.1 and 32 bytes for AVX2.
+constexpr int kOverreadInBytesPass1_128 = 10;
+constexpr int kOverreadInBytesPass2_128 = 12;
+constexpr int kOverreadInBytesPass1_256 = kOverreadInBytesPass1_128 + 16;
+constexpr int kOverreadInBytesPass2_256 = kOverreadInBytesPass2_128 + 16;
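+// For example, with 16-byte SSE4.1 registers and the Pass 1 padding of 3
+// pixels, kOverreadInBytesPass1_128 = 16 - 2 * 3 = 10; with the Pass 2
+// padding of 2 pixels, kOverreadInBytesPass2_128 = 16 - 2 * 2 = 12. The AVX2
+// variants add the extra 16 bytes of a 32-byte register.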
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned32x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+}
+
+inline void LoadAligned32x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned32x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32(src[0] + x);
+ dst[1] = LoadAligned32(src[1] + x);
+ dst[2] = LoadAligned32(src[2] + x);
+}
+
+inline void LoadAligned32x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3]) {
+ dst[0] = LoadAligned32Msan(src[0] + x, sizeof(**src) * (x + 16 - border));
+ dst[1] = LoadAligned32Msan(src[1] + x, sizeof(**src) * (x + 16 - border));
+ dst[2] = LoadAligned32Msan(src[2] + x, sizeof(**src) * (x + 16 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m256i dst[2][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned64x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[2][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m256i dst[3][2]) {
+ LoadAligned64(src[0] + x, dst[0]);
+ LoadAligned64(src[1] + x, dst[1]);
+ LoadAligned64(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned64x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m256i dst[3][2]) {
+ LoadAligned64Msan(src[0] + x, sizeof(**src) * (x + 16 - border), dst[0]);
+ LoadAligned64Msan(src[1] + x, sizeof(**src) * (x + 16 - border), dst[1]);
+ LoadAligned64Msan(src[2] + x, sizeof(**src) * (x + 16 - border), dst[2]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
+// functions. Some compilers may generate very inefficient code, and the whole
+// decoder could be 15% slower.
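+// The helpers below follow the Arm NEON naming scheme: Vaddl* widens both
+// operands before adding, Vaddw* widens only the second operand, and Vmull*
+// widens the operands before multiplying.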
+
+inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m256i VaddlHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi8(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(s0, s1);
+}
+
+inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(s0, s1);
+}
+
+inline __m256i VaddlHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(s0, s1);
+}
+
+inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
+ return _mm_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwLo8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m256i VaddwHi8(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi8(src1, _mm256_setzero_si256());
+ return _mm256_add_epi16(src0, s1);
+}
+
+inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_add_epi32(src0, s1);
+}
+
+inline __m256i VaddwHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_add_epi32(src0, s1);
+}
+
+inline __m256i VmullNLo8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m256i VmullNHi8(const __m256i src0, const int src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, _mm256_set1_epi32(src1));
+}
+
+inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullLo16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpacklo_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpacklo_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
+ const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
+ const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
+ return _mm_madd_epi16(s0, s1);
+}
+
+inline __m256i VmullHi16(const __m256i src0, const __m256i src1) {
+ const __m256i s0 = _mm256_unpackhi_epi16(src0, _mm256_setzero_si256());
+ const __m256i s1 = _mm256_unpackhi_epi16(src1, _mm256_setzero_si256());
+ return _mm256_madd_epi16(s0, s1);
+}
+
+inline __m256i VrshrS32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srai_epi32(sum, src1);
+}
+
+inline __m128i VrshrU32(const __m128i src0, const int src1) {
+ const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
+ return _mm_srli_epi32(sum, src1);
+}
+
+inline __m256i VrshrU32(const __m256i src0, const int src1) {
+ const __m256i sum =
+ _mm256_add_epi32(src0, _mm256_set1_epi32(1 << (src1 - 1)));
+ return _mm256_srli_epi32(sum, src1);
+}
+
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareLo8(const __m256i src) {
+ const __m256i s = _mm256_unpacklo_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
+}
+
+inline __m256i SquareHi8(const __m256i src) {
+ const __m256i s = _mm256_unpackhi_epi8(src, _mm256_setzero_si256());
+ return _mm256_mullo_epi16(s, s);
+}
+
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+}
+
+inline void Prepare3_8(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = _mm256_alignr_epi8(src[1], src[0], 0);
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 1);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 2);
+}
+
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare3_16(const __m256i src[2], __m256i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm256_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm256_alignr_epi8(src[1], src[0], 4);
+}
+
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
+}
+
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
+}
+
+inline void Prepare5_16(const __m256i src[2], __m256i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm256_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm256_alignr_epi8(src[1], src[0], 8);
+}
+
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m256i Sum3_16(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi16(src0, src1);
+ return _mm256_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m256i Sum3_16(const __m256i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline __m256i Sum3_32(const __m256i src0, const __m256i src1,
+ const __m256i src2) {
+ const __m256i sum = _mm256_add_epi32(src0, src1);
+ return _mm256_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline void Sum3_32(const __m256i src[3][2], __m256i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WLo16(const __m256i src[3]) {
+ const __m256i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m256i Sum3WHi16(const __m256i src[3]) {
+ const __m256i sum = VaddlHi8(src[0], src[1]);
+ return VaddwHi8(sum, src[2]);
+}
+
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m256i Sum3WLo32(const __m256i src[3]) {
+ const __m256i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
+}
+
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m256i Sum3WHi32(const __m256i src[3]) {
+ const __m256i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
+}
+
+inline __m128i Sum5_16(const __m128i src[5]) {
+ const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
+ const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return _mm_add_epi16(sum, src[4]);
+}
+
+inline __m256i Sum5_16(const __m256i src[5]) {
+ const __m256i sum01 = _mm256_add_epi16(src[0], src[1]);
+ const __m256i sum23 = _mm256_add_epi16(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return _mm256_add_epi16(sum, src[4]);
+}
+
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
+ const __m128i sum = _mm_add_epi32(sum01, sum23);
+ return _mm_add_epi32(sum, *src4);
+}
+
+inline __m256i Sum5_32(const __m256i* const src0, const __m256i* const src1,
+ const __m256i* const src2, const __m256i* const src3,
+ const __m256i* const src4) {
+ const __m256i sum01 = _mm256_add_epi32(*src0, *src1);
+ const __m256i sum23 = _mm256_add_epi32(*src2, *src3);
+ const __m256i sum = _mm256_add_epi32(sum01, sum23);
+ return _mm256_add_epi32(sum, *src4);
+}
+
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline void Sum5_32(const __m256i src[5][2], __m256i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
+ const __m128i sum01 = VaddlLo8(src[0], src[1]);
+ const __m128i sum23 = VaddlLo8(src[2], src[3]);
+ const __m128i sum = _mm_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WLo16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlLo8(src[0], src[1]);
+ const __m256i sum23 = VaddlLo8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwLo8(sum, src[4]);
+}
+
+inline __m256i Sum5WHi16(const __m256i src[5]) {
+ const __m256i sum01 = VaddlHi8(src[0], src[1]);
+ const __m256i sum23 = VaddlHi8(src[2], src[3]);
+ const __m256i sum = _mm256_add_epi16(sum01, sum23);
+ return VaddwHi8(sum, src[4]);
+}
+
+inline __m128i Sum3Horizontal(const __m128i src) {
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
+}
+
+inline void Sum3Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes, __m256i dst[2]) {
+ __m256i s[3];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
+}
+
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline void Sum3WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
+}
+
+inline __m128i Sum5Horizontal(const __m128i src) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
+}
+
+inline void Sum5Horizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const dst0, __m256i* const dst1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
+}
+
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
+inline void Sum5WHorizontal(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ const __m256i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m256i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m256i sum0123_lo = _mm256_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m256i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m256i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m256i sum0123_hi = _mm256_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
+}
+
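+// Computes the 3-tap and 5-tap horizontal box sums together; the 5-tap sum
+// reuses the 3-tap sum of the middle columns: row5 = row3 + src[0] + src[4].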
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m256i src[5], __m256i* const row_sq3,
+ __m256i* const row_sq5) {
+ const __m256i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm256_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
+ __m128i s[5];
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
+}
+
+inline void SumHorizontal(const uint8_t* const src,
+ const ptrdiff_t over_read_in_bytes,
+ __m256i* const row3_0, __m256i* const row3_1,
+ __m256i* const row5_0, __m256i* const row5_1) {
+ __m256i s[5];
+ s[0] = LoadUnaligned32Msan(src + 0, over_read_in_bytes + 0);
+ s[1] = LoadUnaligned32Msan(src + 1, over_read_in_bytes + 1);
+ s[2] = LoadUnaligned32Msan(src + 2, over_read_in_bytes + 2);
+ s[3] = LoadUnaligned32Msan(src + 3, over_read_in_bytes + 3);
+ s[4] = LoadUnaligned32Msan(src + 4, over_read_in_bytes + 4);
+ const __m256i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m256i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm256_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm256_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline void SumHorizontal(const __m256i src[2], __m256i* const row_sq3_0,
+ __m256i* const row_sq3_1, __m256i* const row_sq5_0,
+ __m256i* const row_sq5_1) {
+ __m256i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
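+// Weighted 343 sum: 3 * (a + b + c) + b = 3a + 4b + 3c for adjacent
+// columns a, b, c.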
+inline __m256i Sum343Lo(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WLo16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343Hi(const __m256i ma3[3]) {
+ const __m256i sum = Sum3WHi16(ma3);
+ const __m256i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m256i Sum343WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m256i Sum343WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
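+// Weighted 565 sum: 5 * (a + b + c) + b = 5a + 6b + 5c for adjacent
+// columns a, b, c.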
+inline __m256i Sum565Lo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m256i Sum565Hi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi16(src);
+ const __m256i sum4 = _mm256_slli_epi16(sum, 2);
+ const __m256i sum5 = _mm256_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m256i Sum565WLo(const __m256i src[3]) {
+ const __m256i sum = Sum3WLo32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m256i Sum565WHi(const __m256i src[3]) {
+ const __m256i sum = Sum3WHi32(src);
+ const __m256i sum4 = _mm256_slli_epi32(sum, 2);
+ const __m256i sum5 = _mm256_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m256i src[2], __m256i dst[2]) {
+ __m256i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src, kOverreadInBytesPass1_128 - width);
+ __m128i sq_128[2], s3, s5, sq3[2], sq5[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s0);
+ sq_128[1] = SquareHi8(s0);
+ SumHorizontalLo(s0, &s3, &s5);
+ StoreAligned16(sum3, s3);
+ StoreAligned16(sum5, s5);
+ SumHorizontal(sq_128, &sq3[0], &sq3[1], &sq5[0], &sq5[1]);
+ StoreAligned32U32(square_sum3, sq3);
+ StoreAligned32U32(square_sum5, sq5);
+ src += 8;
+ sum3 += 8;
+ sum5 += 8;
+ square_sum3 += 8;
+ square_sum5 += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytesPass1_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
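+      // 0x21 selects the high lane of sq[0] and the low lane of sq[2], so
+      // sq[0] now holds the 16 squares bridging the previous and current
+      // 32-byte loads.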
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, sum_width - x + 8 + kOverreadInBytesPass1_256 - width,
+ &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned64(sum3, row3);
+ StoreAligned64(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 0, row_sq3);
+ StoreAligned64(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned64(square_sum3 + 16, row_sq3);
+ StoreAligned64(square_sum5 + 16, row_sq5);
+ sq[0] = sq[2];
+ src += 32;
+ sum3 += 32;
+ sum5 += 32;
+ square_sum3 += 32;
+ square_sum5 += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sum3 += sum_stride - sum_width - 8;
+ sum5 += sum_stride - sum_width - 8;
+ square_sum3 += sum_stride - sum_width - 8;
+ square_sum5 += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int size>
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
+ static_assert(size == 3 || size == 5, "");
+ int kOverreadInBytes_128, kOverreadInBytes_256;
+ if (size == 3) {
+ kOverreadInBytes_128 = kOverreadInBytesPass2_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass2_256;
+ } else {
+ kOverreadInBytes_128 = kOverreadInBytesPass1_128;
+ kOverreadInBytes_256 = kOverreadInBytesPass1_256;
+ }
+ int y = 2;
+ do {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytes_128 - width);
+ __m128i ss, sq_128[2], sqs[2];
+ __m256i sq[3];
+ sq_128[0] = SquareLo8(s);
+ sq_128[1] = SquareHi8(s);
+ if (size == 3) {
+ ss = Sum3Horizontal(s);
+ Sum3WHorizontal(sq_128, sqs);
+ } else {
+ ss = Sum5Horizontal(s);
+ Sum5WHorizontal(sq_128, sqs);
+ }
+ StoreAligned16(sums, ss);
+ StoreAligned32U32(square_sums, sqs);
+ src += 8;
+ sums += 8;
+ square_sums += 8;
+ sq[0] = SetrM128i(sq_128[1], sq_128[1]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m256i row[2], row_sq[4];
+ const __m256i s = LoadUnaligned32Msan(
+ src + 8, sum_width - x + 16 + kOverreadInBytes_256 - width);
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ if (size == 3) {
+ Sum3Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal(src, sum_width - x + 8 + kOverreadInBytes_256 - width,
+ &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned64(sums, row);
+ StoreAligned64(square_sums + 0, row_sq + 0);
+ StoreAligned64(square_sums + 16, row_sq + 2);
+ sq[0] = sq[2];
+ src += 32;
+ sums += 32;
+ square_sums += 32;
+ x -= 32;
+ } while (x != 0);
+ src += src_stride - sum_width - 8;
+ sums += sum_stride - sum_width - 8;
+ square_sums += sum_stride - sum_width - 8;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
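+  // n == 9:  sum_sq + (sum_sq << 3)                 ==  9 * sum_sq.
+  // n == 25: sum_sq + (sum_sq << 3) + (sum_sq << 4) == 25 * sum_sq.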
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
+ const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m256i dxd = _mm256_madd_epi16(sum, sum);
+ // _mm256_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm256_mullo_epi32(sum_sq, _mm256_set1_epi32(n));
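+  // n == 9:  sum_sq + (sum_sq << 3)                 ==  9 * sum_sq.
+  // n == 25: sum_sq + (sum_sq << 3) + (sum_sq << 4) == 25 * sum_sq.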
+ __m256i axn = _mm256_add_epi32(sum_sq, _mm256_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm256_add_epi32(axn, _mm256_slli_epi32(sum_sq, 4));
+ const __m256i sub = _mm256_sub_epi32(axn, dxd);
+ const __m256i p = _mm256_max_epi32(sub, _mm256_setzero_si256());
+ const __m256i pxs = _mm256_mullo_epi32(p, _mm256_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m256i CalculateMa(const __m256i sum, const __m256i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ const __m256i sum_lo = _mm256_unpacklo_epi16(sum, _mm256_setzero_si256());
+ const __m256i sum_hi = _mm256_unpackhi_epi16(sum, _mm256_setzero_si256());
+ const __m256i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m256i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm256_packus_epi32(z0, z1);
+}
+
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
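+  // 164 does not fit in the signed 8-bit operand of _mm_maddubs_epi16(), so
+  // multiply by 41 (= 164 / 4) here and shift right by 2 fewer bits below.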
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB5(const __m256i sum, const __m256i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
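+  // 164 does not fit in the signed 8-bit operand of _mm256_maddubs_epi16(),
+  // so multiply by 41 (= 164 / 4) here and shift right by 2 fewer bits below.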
+ // |ma| is in range [0, 255].
+ const __m256i m =
+ _mm256_maddubs_epi16(ma, _mm256_set1_epi16(one_over_n_quarter));
+ const __m256i m0 = VmullLo16(m, sum);
+ const __m256i m1 = VmullHi16(m, sum);
+ const __m256i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m256i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
+}
+
+inline __m256i CalculateB3(const __m256i sum, const __m256i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m256i m0 = VmullLo16(ma, sum);
+ const __m256i m1 = VmullHi16(ma, sum);
+ const __m256i m2 = _mm256_mullo_epi32(m0, _mm256_set1_epi32(one_over_n));
+ const __m256i m3 = _mm256_mullo_epi32(m1, _mm256_set1_epi32(one_over_n));
+ const __m256i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m256i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm256_packus_epi32(b_lo, b_hi);
+}
+
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex5(const __m256i s5[5], const __m256i sq5[5][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+inline void CalculateSumAndIndex3(const __m256i s3[3], const __m256i sq3[3][2],
+ const uint32_t scale, __m256i* const sum,
+ __m256i* const index) {
+ __m256i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
+}
+
+template <int n>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+  // The values are not actually stored to and loaded from memory; the
+  // compiler keeps |temp| in a 64-bit general-purpose register, which is
+  // faster than using _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], 7);
+  // b = ma * sum * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
+}
+
+// Repeat the first 48 elements in kSgrMaLookup with a period of 16.
+alignas(32) constexpr uint8_t kSgrMaLookupAvx2[96] = {
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5,
+ 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5};
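+// _mm256_shuffle_epi8() shuffles within each 128-bit lane independently, so
+// each 16-entry row of the table must be present in both lanes.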
+
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction, or from the sign bit of the index.
+inline __m256i ShuffleIndex(const __m256i table, const __m256i index) {
+ __m256i mask;
+ mask = _mm256_cmpgt_epi8(index, _mm256_set1_epi8(15));
+ mask = _mm256_or_si256(mask, index);
+ return _mm256_shuffle_epi8(table, mask);
+}
+
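+// Subtracts 1 from each lane of |value| whose index exceeds |threshold|; the
+// comparison mask is -1 per matching byte.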
+inline __m256i AdjustValue(const __m256i value, const __m256i index,
+ const int threshold) {
+ const __m256i thresholds = _mm256_set1_epi8(threshold - 128);
+ const __m256i offset = _mm256_cmpgt_epi8(index, thresholds);
+ return _mm256_add_epi8(value, offset);
+}
+
+template <int n>
+inline void CalculateIntermediate(const __m256i sum[2], const __m256i index[2],
+ __m256i ma[3], __m256i b[2]) {
+ static_assert(n == 9 || n == 25, "");
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m256i c0 = LoadAligned32(kSgrMaLookupAvx2 + 0 * 32);
+ const __m256i c1 = LoadAligned32(kSgrMaLookupAvx2 + 1 * 32);
+ const __m256i c2 = LoadAligned32(kSgrMaLookupAvx2 + 2 * 32);
+ const __m256i indices = _mm256_packus_epi16(index[0], index[1]);
+ __m256i idx, mas;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm256_min_epu8(indices, _mm256_set1_epi8(127));
+ // All elements whose indices are less than 48 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ mas = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ mas = _mm256_or_si256(mas, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm256_sub_epi8(idx, _mm256_set1_epi8(16));
+ const __m256i res2 = ShuffleIndex(c2, idx);
+ mas = _mm256_or_si256(mas, res2);
+
+  // For elements whose indices are larger than 47, since their values change
+  // only rarely as the index increases, we use comparison and arithmetic
+  // operations to calculate them.
+  // Add -128 to apply signed comparison instructions.
+  idx = _mm256_add_epi8(indices, _mm256_set1_epi8(-128));
+  // Elements whose indices are larger than 47, which the shuffles above left
+  // at 0, are set to 5.
+  mas = _mm256_max_epu8(mas, _mm256_set1_epi8(5));
+  mas = AdjustValue(mas, idx, 55);   // 55 is the last index whose value is 5.
+  mas = AdjustValue(mas, idx, 72);   // 72 is the last index whose value is 4.
+  mas = AdjustValue(mas, idx, 101);  // 101 is the last index whose value is 3.
+  mas = AdjustValue(mas, idx, 169);  // 169 is the last index whose value is 2.
+  mas = AdjustValue(mas, idx, 254);  // 254 is the last index whose value is 1.
+
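+  // _mm256_packus_epi16 interleaves the two 128-bit lanes, so the permute and
+  // blend below rearrange |mas| into the windows the following sum routines
+  // expect (see the byte-range annotations).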
+ ma[2] = _mm256_permute4x64_epi64(mas, 0x93); // 32-39 8-15 16-23 24-31
+ ma[0] = _mm256_blend_epi32(ma[0], ma[2], 0xfc); // 0-7 8-15 16-23 24-31
+ ma[1] = _mm256_permute2x128_si256(ma[0], ma[2], 0x21);
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m256i maq0 = _mm256_unpackhi_epi8(ma[0], _mm256_setzero_si256());
+ const __m256i maq1 = _mm256_unpacklo_epi8(ma[1], _mm256_setzero_si256());
+ if (n == 9) {
+ b[0] = CalculateB3(sum[0], maq0);
+ b[1] = CalculateB3(sum[1], maq1);
+ } else {
+ b[0] = CalculateB5(sum[0], maq0);
+ b[1] = CalculateB5(sum[1], maq1);
+ }
+}
+
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9>(sum, index, ma, b);
+}
+
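+// The numeric suffixes are the filter weights applied to three neighboring
+// box sums: sum_b444 = 4 * (b[0] + b[1] + b[2]) and
+// sum_b343 = 3 * b[0] + 4 * b[1] + 3 * b[2], both derived from the plain
+// 1-1-1 sum computed below. The same applies to the ma343/ma444 buffers.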
+inline void Store343_444(const __m256i b3[2], const ptrdiff_t x,
+ __m256i sum_b343[2], __m256i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m256i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm256_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm256_slli_epi32(sum_b111[1], 2);
+ StoreAligned64(b444 + x, sum_b444);
+ sum_b343[0] = _mm256_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm256_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned64(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i* const sum_ma444, __m256i sum_b343[2],
+ __m256i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m256i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm256_slli_epi16(sum_ma111, 2);
+ StoreAligned32(ma444 + x, *sum_ma444);
+ const __m256i sum333 = _mm256_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned32(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, __m256i* const sum_ma343,
+ __m256i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m256i ma3[3], const __m256i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m256i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][3], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t sum_width,
+ const ptrdiff_t x, const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m256i sq[2][3], __m256i ma[3],
+ __m256i b[3]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ Sum5Horizontal(src0, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ Sum5Horizontal(src1, over_read_in_bytes, &s5[0][4], &s5[1][4]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint32_t scale,
+ const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
+ __m256i sq[3], __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s5[2][5], sq5[5][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum5Horizontal(src, over_read_in_bytes, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scale, &sum[0], &index[0]);
+
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scale, &sum[1], &index[1]);
+ CalculateIntermediate<25>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t x, const ptrdiff_t sum_width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3], __m256i sq[3],
+ __m256i ma[3], __m256i b[3]) {
+ const __m256i s = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[4], sq3[3][2], sum[2], index[2];
+ sq[1] = SquareLo8(s);
+ sq[2] = SquareHi8(s);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ Sum3Horizontal(src, over_read_in_bytes, s3 + 2);
+ StoreAligned64(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 0, sq3[2]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ LoadAligned32x2U16(sum3, x, s3);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3 + 1);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate<9>(sum, index, ma, b + 1);
+ b[0] = _mm256_permute2x128_si256(b[0], b[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][2], __m128i ma3[2],
+ __m128i b3[2], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0]);
+ sq[1][1] = SquareHi8(s[1]);
+ SumHorizontalLo(s[0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+  // Note: in the SSE4_1 version, CalculateIntermediate() replaces the slow
+  // LookupIntermediate() when calculating 16 intermediate data points.
+  // However, with AVX2 the compiler generates even slower code for that
+  // approach, so we keep using CalculateIntermediate3() here.
+ CalculateIntermediate3(s3 + 0, sq3 + 0, scales[1], &ma3[0], &b3[0]);
+ CalculateIntermediate3(s3 + 1, sq3 + 1, scales[1], &ma3[1], &b3[1]);
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1,
+ const ptrdiff_t over_read_in_bytes, const ptrdiff_t x,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m256i sq[2][3], __m256i ma3[2][3],
+ __m256i b3[2][5], __m256i ma5[3], __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src0 + 8, over_read_in_bytes + 8);
+ const __m256i s1 = LoadUnaligned32Msan(src1 + 8, over_read_in_bytes + 8);
+ __m256i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2][2], index_3[2][2],
+ sum_5[2], index_5[2];
+ sq[0][1] = SquareLo8(s0);
+ sq[0][2] = SquareHi8(s0);
+ sq[1][1] = SquareLo8(s1);
+ sq[1][2] = SquareHi8(s1);
+ sq[0][0] = _mm256_permute2x128_si256(sq[0][0], sq[0][2], 0x21);
+ sq[1][0] = _mm256_permute2x128_si256(sq[1][0], sq[1][2], 0x21);
+ SumHorizontal(src0, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(src1, over_read_in_bytes, &s3[0][3], &s3[1][3], &s5[0][4],
+ &s5[1][4]);
+ StoreAligned32(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned32(sum3[2] + x + 16, s3[1][2]);
+ StoreAligned32(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned32(sum3[3] + x + 16, s3[1][3]);
+ StoreAligned32(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned32(sum5[3] + x + 16, s5[1][3]);
+ StoreAligned32(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned32(sum5[4] + x + 16, s5[1][4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x, sq3[2]);
+ StoreAligned64(square_sum5[3] + x, sq5[3]);
+ StoreAligned64(square_sum3[3] + x, sq3[3]);
+ StoreAligned64(square_sum5[4] + x, sq5[4]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0][0], &index_3[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum_3[1][0],
+ &index_3[1][0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned64(square_sum3[2] + x + 16, sq3[2]);
+ StoreAligned64(square_sum5[3] + x + 16, sq5[3]);
+ StoreAligned64(square_sum3[3] + x + 16, sq3[3]);
+ StoreAligned64(square_sum5[4] + x + 16, sq5[4]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[0][1], &index_3[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum_3[1][1],
+ &index_3[1][1]);
+ CalculateIntermediate<9>(sum_3[0], index_3[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate<9>(sum_3[1], index_3[1], ma3[1], b3[1] + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0][0] = _mm256_permute2x128_si256(b3[0][0], b3[0][2], 0x21);
+ b3[1][0] = _mm256_permute2x128_si256(b3[1][0], b3[1][2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const uint8_t* const src, const ptrdiff_t over_read_in_bytes,
+ const ptrdiff_t sum_width, const ptrdiff_t x, const uint16_t scales[2],
+ const uint16_t* const sum3[4], const uint16_t* const sum5[5],
+ const uint32_t* const square_sum3[4], const uint32_t* const square_sum5[5],
+ __m256i sq[6], __m256i ma3[2], __m256i ma5[2], __m256i b3[5],
+ __m256i b5[5]) {
+ const __m256i s0 = LoadUnaligned32Msan(src + 8, over_read_in_bytes + 8);
+ __m256i s3[2][3], s5[2][5], sq3[4][2], sq5[5][2], sum_3[2], index_3[2],
+ sum_5[2], index_5[2];
+ sq[1] = SquareLo8(s0);
+ sq[2] = SquareHi8(s0);
+ sq[0] = _mm256_permute2x128_si256(sq[0], sq[2], 0x21);
+ SumHorizontal(src, over_read_in_bytes, &s3[0][2], &s3[1][2], &s5[0][3],
+ &s5[1][3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16(sum3, x, s3[0]);
+ LoadAligned64x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum_3[0], &index_3[0]);
+ LoadAligned32x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned64x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[0], sq5, scales[0], &sum_5[0], &index_5[0]);
+
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned32x2U16Msan(sum3, x + 16, sum_width, s3[1]);
+ LoadAligned64x2U32Msan(square_sum3, x + 16, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum_3[1], &index_3[1]);
+ CalculateIntermediate<9>(sum_3, index_3, ma3, b3 + 1);
+ LoadAligned32x3U16Msan(sum5, x + 16, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned64x3U32Msan(square_sum5, x + 16, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateSumAndIndex5(s5[1], sq5, scales[0], &sum_5[1], &index_5[1]);
+ CalculateIntermediate<25>(sum_5, index_5, ma5, b5 + 1);
+ b3[0] = _mm256_permute2x128_si256(b3[0], b3[2], 0x21);
+ b5[0] = _mm256_permute2x128_si256(b5[0], b5[2], 0x21);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma5[3], ma[2], b[4];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned64(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64(b565, b + 0);
+ StoreAligned64(b565 + 16, b + 2);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
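+// When |calculate444| is true both the 343 and 444 intermediate arrays are
+// written; otherwise only the 343 arrays are produced.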
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ const __m128i s = LoadUnaligned16Msan(src, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, sq_128[2], b0;
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s);
+ BoxFilterPreProcess3Lo(s, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma3[3];
+ BoxFilterPreProcess3(src + x + 8, x + 8 + kOverreadInBytesPass2_256 - width,
+ x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ Prepare3_8(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 16, ma343, ma444, b343, b444);
+ ma444 += 32;
+ b444 += 32;
+ } else {
+ __m256i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned64(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64(b343 + 0, b + 0);
+ StoreAligned64(b343 + 16, b + 2);
+ }
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma343 += 32;
+ b343 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[2], b[4], ma3x[3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned64(ma343[0] + x, ma);
+ Sum343W(b3[0], b);
+ StoreAligned64(b343[0] + x, b);
+ Sum565W(b5, b);
+ StoreAligned64(b565, b);
+ Prepare3_8(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 16, ma343[1], ma444, b343[1], b444);
+ Prepare3_8(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned64(ma565, ma);
+ Sum343W(b3[0] + 1, b);
+ StoreAligned64(b343[0] + x + 16, b);
+ Sum565W(b5 + 1, b);
+ StoreAligned64(b565 + 16, b);
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+template <int shift>
+inline __m256i FilterOutput(const __m256i ma_x_src, const __m256i b) {
+ // ma: 255 * 32 = 8160 (13 bits)
+ // b: 65088 * 32 = 2082816 (21 bits)
+ // v: b - ma * 255 (22 bits)
+ const __m256i v = _mm256_sub_epi32(b, ma_x_src);
+ // kSgrProjSgrBits = 8
+ // kSgrProjRestoreBits = 4
+ // shift = 4 or 5
+ // v >> 8 or 9 (13 bits)
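+  // i.e. kSgrProjSgrBits + shift - kSgrProjRestoreBits is 8 when shift is 4
+  // and 9 when shift is 5.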
+ return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
+}
+
+template <int shift>
+inline __m256i CalculateFilteredOutput(const __m256i src, const __m256i ma,
+ const __m256i b[2]) {
+ const __m256i ma_x_src_lo = VmullLo16(ma, src);
+ const __m256i ma_x_src_hi = VmullHi16(ma, src);
+ const __m256i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
+ const __m256i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
+ return _mm256_packs_epi32(dst_lo, dst_hi); // 13 bits
+}
+
+inline __m256i CalculateFilteredOutputPass1(const __m256i src,
+ const __m256i ma[2],
+ const __m256i b[2][2]) {
+ const __m256i ma_sum = _mm256_add_epi16(ma[0], ma[1]);
+ __m256i b_sum[2];
+ b_sum[0] = _mm256_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm256_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i CalculateFilteredOutputPass2(const __m256i src,
+ const __m256i ma[3],
+ const __m256i b[3][2]) {
+ const __m256i ma_sum = Sum3_16(ma);
+ __m256i b_sum[2];
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
+}
+
+inline __m256i SelfGuidedFinal(const __m256i src, const __m256i v[2]) {
+ const __m256i v_lo =
+ VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i v_hi =
+ VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
+ const __m256i vv = _mm256_packs_epi32(v_lo, v_hi);
+ return _mm256_add_epi16(src, vv);
+}
+
+inline __m256i SelfGuidedDoubleMultiplier(const __m256i src,
+ const __m256i filter[2], const int w0,
+ const int w2) {
+ __m256i v[2];
+ const __m256i w0_w2 =
+ _mm256_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
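+  // Each 32-bit lane of |w0_w2| holds the pair (w0, w2); _mm256_madd_epi16
+  // then computes w0 * filter[0] + w2 * filter[1] for each pixel.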
+ const __m256i f_lo = _mm256_unpacklo_epi16(filter[0], filter[1]);
+ const __m256i f_hi = _mm256_unpackhi_epi16(filter[0], filter[1]);
+ v[0] = _mm256_madd_epi16(w0_w2, f_lo);
+ v[1] = _mm256_madd_epi16(w0_w2, f_hi);
+ return SelfGuidedFinal(src, v);
+}
+
+inline __m256i SelfGuidedSingleMultiplier(const __m256i src,
+ const __m256i filter, const int w0) {
+ // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
+ __m256i v[2];
+ v[0] = VmullNLo8(filter, w0);
+ v[1] = VmullNHi8(filter, w0);
+ return SelfGuidedFinal(src, v);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i ma0, b0, s[2][3], sq_128[2][2];
+ __m256i mas[3], sq[2][3], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0][0]);
+ sq_128[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq_128, &ma0, &b0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2][2];
+ BoxFilterPreProcess5(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ StoreAligned64(ma565[1] + x, ma + 1);
+ Sum565W(bs + 0, b[0][1]);
+ Sum565W(bs + 1, b[1][1]);
+ StoreAligned64(b565[1] + x + 0, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[1][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ const __m256i p00 = CalculateFilteredOutputPass1(sr0_lo, ma, b[0]);
+ const __m256i p01 = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[0][1]);
+ const __m256i d00 = SelfGuidedSingleMultiplier(sr0_lo, p00, w0);
+ const __m256i d10 = SelfGuidedSingleMultiplier(sr1_lo, p01, w0);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ ma[1] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[1][0]);
+ const __m256i p10 = CalculateFilteredOutputPass1(sr0_hi, ma + 1, b[1]);
+ const __m256i p11 = CalculateFilteredOutput<4>(sr1_hi, ma[2], b[1][1]);
+ const __m256i d01 = SelfGuidedSingleMultiplier(sr0_hi, p10, w0);
+ const __m256i d11 = SelfGuidedSingleMultiplier(sr1_hi, p11, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess5LastRowLo(s0, scale, sum5, square_sum5, sq_128, &ma0,
+ &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], ma5[3], b[2][2];
+ BoxFilterPreProcess5LastRow(
+ src0 + x + 8, x + 8 + kOverreadInBytesPass1_256 - width, sum_width,
+ x + 8, scale, sum5, square_sum5, sq, mas, bs);
+ Prepare3_8(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ ma[2] = Sum565Hi(ma5);
+ Sum565W(bs + 0, b[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565);
+ LoadAligned64(b565 + 0, b[0]);
+ const __m256i p0 = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma565 + 16);
+ LoadAligned64(b565 + 16, b[0]);
+ Sum565W(bs + 1, b[1]);
+ const __m256i p1 = CalculateFilteredOutputPass1(sr_hi, ma + 1, b);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ ma565 += 32;
+ b565 += 32;
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass2_128 - width);
+ __m128i ma0, b0, sq_128[2];
+ __m256i mas[3], sq[3], bs[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcess3Lo(s0, scale, sum3, square_sum3, sq_128, &ma0, &b0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ mas[0] = SetrM128i(ma0, ma0);
+ bs[0] = SetrM128i(b0, b0);
+
+ int x = 0;
+ do {
+ __m256i ma[4], b[4][2], ma3[3];
+ BoxFilterPreProcess3(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass2_256 - width, x + 8,
+ sum_width, scale, sum3, square_sum3, sq, mas, bs);
+ Prepare3_8(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x + 0, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ Store343_444Hi(ma3, bs + 1, x + 16, &ma[3], b[3], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma343[0] + x);
+ ma[1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[0]);
+ LoadAligned64(b444[0] + x, b[1]);
+ const __m256i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ ma[1] = LoadAligned32(ma343[0] + x + 16);
+ ma[2] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1]);
+ LoadAligned64(b444[0] + x + 16, b[2]);
+ const __m256i p1 = CalculateFilteredOutputPass2(sr_hi, ma + 1, b + 1);
+ const __m256i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m256i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ mas[0] = mas[2];
+ bs[0] = bs[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2], ma3_128[2], ma5_0, sq_128[2][2], b3_128[2], b5_0;
+ __m256i ma3[2][3], ma5[3], sq[2][3], b3[2][5], b5[5];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ s[1] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1_128 - width);
+ sq_128[0][0] = SquareLo8(s[0]);
+ sq_128[1][0] = SquareLo8(s[1]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq_128,
+ ma3_128, b3_128, &ma5_0, &b5_0);
+ sq[0][0] = SetrM128i(sq_128[0][0], sq_128[0][1]);
+ sq[1][0] = SetrM128i(sq_128[1][0], sq_128[1][1]);
+ ma3[0][0] = SetrM128i(ma3_128[0], ma3_128[0]);
+ ma3[1][0] = SetrM128i(ma3_128[1], ma3_128[1]);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0][0] = SetrM128i(b3_128[0], b3_128[0]);
+ b3[1][0] = SetrM128i(b3_128[1], b3_128[1]);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3][3], mat[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ BoxFilterPreProcess(src0 + x + 8, src1 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width, x + 8,
+ scales, sum3, sum5, square_sum3, square_sum5, sum_width,
+ sq, ma3, b3, ma5, b5);
+ Prepare3_8(ma3[0], ma3x[0]);
+ Prepare3_8(ma3[1], ma3x[1]);
+ Prepare3_8(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ ma[0][2] = Sum565Hi(ma5x);
+ mat[0][1] = ma[0][2];
+ StoreAligned64(ma565[1] + x, ma[0] + 1);
+ Sum565W(b5, b[0][1]);
+ StoreAligned64(b565[1] + x, b[0][1]);
+ const __m256i sr0 = LoadUnaligned32(src + x);
+ const __m256i sr1 = LoadUnaligned32(src + stride + x);
+ const __m256i sr0_lo = _mm256_unpacklo_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_lo = _mm256_unpacklo_epi8(sr1, _mm256_setzero_si256());
+ ma[0][0] = LoadAligned32(ma565[0] + x);
+ LoadAligned64(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned32(ma343[0] + x);
+ ma[1][1] = LoadAligned32(ma444[0] + x);
+ LoadAligned64(b343[0] + x, b[1][0]);
+ LoadAligned64(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m256i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned32(ma343[1] + x);
+ LoadAligned64(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m256i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned64(b565[1] + x + 16, b[0][1]);
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 16, &mat[1][2], &mat[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 16, &mat[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ const __m256i sr0_hi = _mm256_unpackhi_epi8(sr0, _mm256_setzero_si256());
+ const __m256i sr1_hi = _mm256_unpackhi_epi8(sr1, _mm256_setzero_si256());
+ mat[0][0] = LoadAligned32(ma565[0] + x + 16);
+ LoadAligned64(b565[0] + x + 16, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, mat[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, mat[0][1], b[0][1]);
+ mat[1][0] = LoadAligned32(ma343[0] + x + 16);
+ mat[1][1] = LoadAligned32(ma444[0] + x + 16);
+ LoadAligned64(b343[0] + x + 16, b[1][0]);
+ LoadAligned64(b444[0] + x + 16, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, mat[1], b[1]);
+ const __m256i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ mat[2][0] = LoadAligned32(ma343[1] + x + 16);
+ LoadAligned64(b343[1] + x + 16, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, mat[2], b[2]);
+ const __m256i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d00, d01));
+ StoreUnaligned32(dst + stride + x, _mm256_packus_epi16(d10, d11));
+ sq[0][0] = sq[0][2];
+ sq[1][0] = sq[1][2];
+ ma3[0][0] = ma3[0][2];
+ ma3[1][0] = ma3[1][2];
+ ma5[0] = ma5[2];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ const __m128i s0 =
+ LoadUnaligned16Msan(src0, kOverreadInBytesPass1_128 - width);
+ __m128i ma3_0, ma5_0, b3_0, b5_0, sq_128[2];
+ __m256i ma3[3], ma5[3], sq[3], b3[3], b5[3];
+ sq_128[0] = SquareLo8(s0);
+ BoxFilterPreProcessLastRowLo(s0, scales, sum3, sum5, square_sum3, square_sum5,
+ sq_128, &ma3_0, &ma5_0, &b3_0, &b5_0);
+ sq[0] = SetrM128i(sq_128[0], sq_128[1]);
+ ma3[0] = SetrM128i(ma3_0, ma3_0);
+ ma5[0] = SetrM128i(ma5_0, ma5_0);
+ b3[0] = SetrM128i(b3_0, b3_0);
+ b5[0] = SetrM128i(b5_0, b5_0);
+
+ int x = 0;
+ do {
+ __m256i ma[3], mat[3], b[3][2], p[2], ma3x[3], ma5x[3];
+ BoxFilterPreProcessLastRow(src0 + x + 8,
+ x + 8 + kOverreadInBytesPass1_256 - width,
+ sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8(ma3, ma3x);
+ Prepare3_8(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m256i sr = LoadUnaligned32(src + x);
+ const __m256i sr_lo = _mm256_unpacklo_epi8(sr, _mm256_setzero_si256());
+ ma[0] = LoadAligned32(ma565 + x);
+ LoadAligned64(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned32(ma343 + x);
+ ma[1] = LoadAligned32(ma444 + x);
+ LoadAligned64(b343 + x, b[0]);
+ LoadAligned64(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m256i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ mat[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ mat[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m256i sr_hi = _mm256_unpackhi_epi8(sr, _mm256_setzero_si256());
+ mat[0] = LoadAligned32(ma565 + x + 16);
+ LoadAligned64(b565 + x + 16, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, mat, b);
+ mat[0] = LoadAligned32(ma343 + x + 16);
+ mat[1] = LoadAligned32(ma444 + x + 16);
+ LoadAligned64(b343 + x + 16, b[0]);
+ LoadAligned64(b444 + x + 16, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, mat, b);
+ const __m256i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreUnaligned32(dst + x, _mm256_packus_epi16(d0, d1));
+ sq[0] = sq[2];
+ ma3[0] = ma3[2];
+ ma5[0] = ma5[2];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 32;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
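+  // By construction the three weights sum to (1 << kSgrProjPrecisionBits).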
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, temp_stride, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
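+  // Each iteration handles two rows, rotating the row sum buffers and the
+  // intermediate ma/b buffers as the filter window slides down.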
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5 + kSumOffset;
+ square_sum5[0] = sgr_buffer->square_sum5 + kSumOffset;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
+ const auto temp_stride = Align<ptrdiff_t>(width, 32);
+ const auto sum_width = temp_stride + 8;
+ const auto sum_stride = temp_stride + 32;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3 + kSumOffset;
+ square_sum3[0] = sgr_buffer->square_sum3 + kSumOffset;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, temp_stride,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
+
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
+}
+
+// If |width| is not a multiple of 32, up to 31 extra pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it
+// will not be part of the visible frame.
+void SelfGuidedFilter_AVX2(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
+ const int index = restoration_info.sgr_proj_info.index;
+ const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
+ const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
+ if (radius_pass_1 == 0) {
+ // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
+ // following assertion.
+ assert(radius_pass_0 != 0);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else if (radius_pass_0 == 0) {
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
+ } else {
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
+ }
+}
+
+void Init8bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_8BPP_AVX2(WienerFilter)
+ dsp->loop_restorations[0] = WienerFilter_AVX2;
+#endif
+#if DSP_ENABLED_8BPP_AVX2(SelfGuidedFilter)
+ dsp->loop_restorations[1] = SelfGuidedFilter_AVX2;
+#endif
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+void LoopRestorationInit_AVX2() { low_bitdepth::Init8bpp(); }
+
+} // namespace dsp
+} // namespace libgav1
+
+#else // !LIBGAV1_TARGETING_AVX2
+namespace libgav1 {
+namespace dsp {
+
+void LoopRestorationInit_AVX2() {}
+
+} // namespace dsp
+} // namespace libgav1
+#endif // LIBGAV1_TARGETING_AVX2
diff --git a/libgav1/src/dsp/x86/loop_restoration_avx2.h b/libgav1/src/dsp/x86/loop_restoration_avx2.h
new file mode 100644
index 0000000..2c3534a
--- /dev/null
+++ b/libgav1/src/dsp/x86/loop_restoration_avx2.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 The libgav1 Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+#define LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
+
+#include "src/dsp/dsp.h"
+#include "src/utils/cpu.h"
+
+namespace libgav1 {
+namespace dsp {
+
+// Initializes Dsp::loop_restorations, see the defines below for specifics.
+// These functions are not thread-safe.
+void LoopRestorationInit_AVX2();
+void LoopRestorationInit10bpp_AVX2();
+
+} // namespace dsp
+} // namespace libgav1
+
+// If avx2 is enabled and the baseline isn't set due to a higher level of
+// optimization being enabled, signal that the avx2 implementation should be
+// used.
+#if LIBGAV1_TARGETING_AVX2
+
+#ifndef LIBGAV1_Dsp8bpp_WienerFilter
+#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_AVX2
+#endif
+
+#endif // LIBGAV1_TARGETING_AVX2
+
+#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_AVX2_H_
diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.cc b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
index 34f4ae8..273bcc8 100644
--- a/libgav1/src/dsp/x86/loop_restoration_sse4.cc
+++ b/libgav1/src/dsp/x86/loop_restoration_sse4.cc
@@ -15,9 +15,10 @@
#include "src/dsp/loop_restoration.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
+#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -35,194 +36,170 @@
namespace low_bitdepth {
namespace {
-inline void WienerHorizontalTap7Kernel(const __m128i s[2],
- const __m128i filter[4],
- int16_t* const wiener_buffer) {
- const int limit =
- (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
- const int offset =
+inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
+ int16_t* const wiener_buffer) {
+ constexpr int offset =
1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
+ constexpr int limit =
+ (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
const __m128i offsets = _mm_set1_epi16(-offset);
const __m128i limits = _mm_set1_epi16(limit - offset);
- const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
- const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
- const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
- const auto s45 = _mm_alignr_epi8(s[1], s[0], 9);
- const auto s67 = _mm_alignr_epi8(s[1], s[0], 13);
- const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
- const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
- const __m128i madd45 = _mm_maddubs_epi16(s45, filter[2]);
- const __m128i madd67 = _mm_maddubs_epi16(s67, filter[3]);
- const __m128i madd0123 = _mm_add_epi16(madd01, madd23);
- const __m128i madd4567 = _mm_add_epi16(madd45, madd67);
- // The sum range here is [-128 * 255, 90 * 255].
- const __m128i madd = _mm_add_epi16(madd0123, madd4567);
- const __m128i sum = _mm_add_epi16(madd, round);
+ // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
+ const __m128i sum = _mm_add_epi16(s[0], s[1]);
const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
- // Calculate scaled down offset correction, and add to sum here to prevent
- // signed 16 bit outranging.
- const __m128i s_3x128 =
- _mm_slli_epi16(_mm_srli_epi16(s23, 8), 7 - kInterRoundBitsHorizontal);
+ // Add back scaled down offset correction.
const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
const __m128i d1 = _mm_min_epi16(d0, limits);
StoreAligned16(wiener_buffer, d1);
}
-inline void WienerHorizontalTap5Kernel(const __m128i s[2],
+inline void WienerHorizontalTap7Kernel(const __m128i s[4],
+ const __m128i filter[4],
+ int16_t* const wiener_buffer) {
+ __m128i madds[4];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
+ madds[1] = _mm_add_epi16(madds[1], madds[3]);
+ const __m128i s_3x128 =
+ _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
+}
+
+inline void WienerHorizontalTap5Kernel(const __m128i s[5],
const __m128i filter[3],
int16_t* const wiener_buffer) {
- const int limit =
- (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
- const int offset =
- 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
- const __m128i offsets = _mm_set1_epi16(-offset);
- const __m128i limits = _mm_set1_epi16(limit - offset);
- const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
- const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
- const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
- const auto s45 = _mm_alignr_epi8(s[1], s[0], 9);
- const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
- const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
- const __m128i madd45 = _mm_maddubs_epi16(s45, filter[2]);
- const __m128i madd0123 = _mm_add_epi16(madd01, madd23);
- // The sum range here is [-128 * 255, 90 * 255].
- const __m128i madd = _mm_add_epi16(madd0123, madd45);
- const __m128i sum = _mm_add_epi16(madd, round);
- const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
- // Calculate scaled down offset correction, and add to sum here to prevent
- // signed 16 bit outranging.
+ __m128i madds[3];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
+ madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
+ madds[0] = _mm_add_epi16(madds[0], madds[2]);
const __m128i s_3x128 =
- _mm_srli_epi16(_mm_slli_epi16(s23, 8), kInterRoundBitsHorizontal + 1);
- const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
- const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
- const __m128i d1 = _mm_min_epi16(d0, limits);
- StoreAligned16(wiener_buffer, d1);
+ _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}
inline void WienerHorizontalTap3Kernel(const __m128i s[2],
const __m128i filter[2],
int16_t* const wiener_buffer) {
- const int limit =
- (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
- const int offset =
- 1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
- const __m128i offsets = _mm_set1_epi16(-offset);
- const __m128i limits = _mm_set1_epi16(limit - offset);
- const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsHorizontal - 1));
- const auto s01 = _mm_alignr_epi8(s[1], s[0], 1);
- const auto s23 = _mm_alignr_epi8(s[1], s[0], 5);
- const __m128i madd01 = _mm_maddubs_epi16(s01, filter[0]);
- const __m128i madd23 = _mm_maddubs_epi16(s23, filter[1]);
- // The sum range here is [-128 * 255, 90 * 255].
- const __m128i madd = _mm_add_epi16(madd01, madd23);
- const __m128i sum = _mm_add_epi16(madd, round);
- const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
- // Calculate scaled down offset correction, and add to sum here to prevent
- // signed 16 bit outranging.
+ __m128i madds[2];
+ madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
+ madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
const __m128i s_3x128 =
- _mm_slli_epi16(_mm_srli_epi16(s01, 8), 7 - kInterRoundBitsHorizontal);
- const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
- const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
- const __m128i d1 = _mm_min_epi16(d0, limits);
- StoreAligned16(wiener_buffer, d1);
+ _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
+ WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}
+// Loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
+ const int coefficient0,
const __m128i coefficients,
int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
__m128i filter[4];
filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
- filter[3] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8000));
- int y = height;
- do {
- const __m128i s0 = LoadUnaligned16(src);
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s0, s0);
- ss[1] = _mm_unpackhi_epi8(s0, s0);
+ filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
+ for (int y = height; y != 0; --y) {
ptrdiff_t x = 0;
do {
- const __m128i s1 = LoadUnaligned16(src + x + 16);
- ss[2] = _mm_unpacklo_epi8(s1, s1);
- ss[3] = _mm_unpackhi_epi8(s1, s1);
- WienerHorizontalTap7Kernel(ss + 0, filter, *wiener_buffer + x + 0);
- WienerHorizontalTap7Kernel(ss + 1, filter, *wiener_buffer + x + 8);
- ss[0] = ss[2];
- ss[1] = ss[3];
+ __m128i s[7], ss[4];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ s[5] = LoadUnaligned16(src + x + 5);
+ s[6] = LoadUnaligned16(src + x + 6);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ ss[3] = _mm_unpacklo_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ ss[3] = _mm_unpackhi_epi8(s[6], round);
+ WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
x += 16;
} while (x < width);
src += src_stride;
*wiener_buffer += width;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
+ const int coefficient1,
const __m128i coefficients,
int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
__m128i filter[3];
filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
- filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8002));
- int y = height;
- do {
- const __m128i s0 = LoadUnaligned16(src);
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s0, s0);
- ss[1] = _mm_unpackhi_epi8(s0, s0);
+ filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
+ for (int y = height; y != 0; --y) {
ptrdiff_t x = 0;
do {
- const __m128i s1 = LoadUnaligned16(src + x + 16);
- ss[2] = _mm_unpacklo_epi8(s1, s1);
- ss[3] = _mm_unpackhi_epi8(s1, s1);
- WienerHorizontalTap5Kernel(ss + 0, filter, *wiener_buffer + x + 0);
- WienerHorizontalTap5Kernel(ss + 1, filter, *wiener_buffer + x + 8);
- ss[0] = ss[2];
- ss[1] = ss[3];
+ __m128i s[5], ss[3];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ s[3] = LoadUnaligned16(src + x + 3);
+ s[4] = LoadUnaligned16(src + x + 4);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ ss[2] = _mm_unpacklo_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ ss[2] = _mm_unpackhi_epi8(s[4], round);
+ WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
x += 16;
} while (x < width);
src += src_stride;
*wiener_buffer += width;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
+ const int coefficient2,
const __m128i coefficients,
int16_t** const wiener_buffer) {
+ const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
__m128i filter[2];
filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
- filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x8004));
- int y = height;
- do {
- const __m128i s0 = LoadUnaligned16(src);
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s0, s0);
- ss[1] = _mm_unpackhi_epi8(s0, s0);
+ filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
+ for (int y = height; y != 0; --y) {
ptrdiff_t x = 0;
do {
- const __m128i s1 = LoadUnaligned16(src + x + 16);
- ss[2] = _mm_unpacklo_epi8(s1, s1);
- ss[3] = _mm_unpackhi_epi8(s1, s1);
- WienerHorizontalTap3Kernel(ss + 0, filter, *wiener_buffer + x + 0);
- WienerHorizontalTap3Kernel(ss + 1, filter, *wiener_buffer + x + 8);
- ss[0] = ss[2];
- ss[1] = ss[3];
+ __m128i s[3], ss[2];
+ s[0] = LoadUnaligned16(src + x + 0);
+ s[1] = LoadUnaligned16(src + x + 1);
+ s[2] = LoadUnaligned16(src + x + 2);
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ ss[1] = _mm_unpacklo_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
+ ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ ss[1] = _mm_unpackhi_epi8(s[2], round);
+ WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
x += 16;
} while (x < width);
src += src_stride;
*wiener_buffer += width;
- } while (--y != 0);
+ }
}
inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
const ptrdiff_t width, const int height,
int16_t** const wiener_buffer) {
- int y = height;
- do {
+ for (int y = height; y != 0; --y) {
ptrdiff_t x = 0;
do {
const __m128i s = LoadUnaligned16(src + x);
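
The hunk above folds the shared clipping code of the tap kernels into WienerHorizontalClip(). A scalar model of that clip for one 8bpp lane, assuming libgav1's kWienerFilterBits == 7 and kInterRoundBitsHorizontal == 3 (a sketch, not the library's code):

// Scalar model of WienerHorizontalClip (illustrative; constants assumed).
#include <algorithm>
#include <cstdint>

int16_t WienerHorizontalClipScalar(const int sum, const int center_correction) {
  constexpr int kWienerFilterBits = 7;
  constexpr int kInterRoundBitsHorizontal = 3;
  constexpr int offset =
      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);  // 2048
  constexpr int limit =
      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
  // Rounding shift of the filtered sum; the vector code bakes the rounding
  // constant into the maddubs inputs instead of adding it here.
  int v = (sum + (1 << (kInterRoundBitsHorizontal - 1))) >>
          kInterRoundBitsHorizontal;
  // Add back the scaled-down correction for the 128 subtracted from the
  // center tap, then clamp to the 13-bit storage range.
  v += center_correction;
  return static_cast<int16_t>(std::min(std::max(v, -offset), limit - offset));
}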
@@ -236,7 +213,7 @@
} while (x < width);
src += src_stride;
*wiener_buffer += width;
- } while (--y != 0);
+ }
}
inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
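
The following hunks split the horizontal pass into top-border, in-frame, and bottom-border runs. A small sketch of the row accounting they rely on, with kWienerFilterTaps == 7 assumed (illustrative, not the patch's code):

#include <cassert>

// Rows consumed by the split horizontal pass; number_rows_to_skip >= 1.
int WienerExtraRows(const int height, const int number_rows_to_skip) {
  const int height_horizontal = height + 7 - 1 - 2 * number_rows_to_skip;
  // height_extra rows are read from the top border (starting at row
  // 2 - height_extra), then |height| rows from the frame, then height_extra
  // rows from the bottom border.
  const int height_extra = (height_horizontal - height) >> 1;
  assert(height_extra <= 2);
  assert(2 * height_extra + height == height_horizontal);
  return height_extra;
}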
@@ -504,19 +481,19 @@
}
}
-void WienerFilter_SSE4_1(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- const ptrdiff_t source_stride,
- const ptrdiff_t dest_stride, const int width,
- const int height, RestorationBuffer* const buffer) {
- constexpr int kCenterTap = kWienerFilterTaps / 2;
+void WienerFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int16_t* const number_leading_zero_coefficients =
restoration_info.wiener_info.number_leading_zero_coefficients;
const int number_rows_to_skip = std::max(
static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
1);
const ptrdiff_t wiener_stride = Align(width, 16);
- int16_t* const wiener_buffer_vertical = buffer->wiener_buffer;
+ int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
// The values are saturated to 13 bits before storing.
int16_t* wiener_buffer_horizontal =
wiener_buffer_vertical + number_rows_to_skip * wiener_stride;
@@ -525,31 +502,61 @@
// Over-reads up to 15 - |kRestorationHorizontalBorder| values.
const int height_horizontal =
height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
- const auto* const src = static_cast<const uint8_t*>(source) -
- (kCenterTap - number_rows_to_skip) * source_stride;
- const __m128i c =
- LoadLo8(restoration_info.wiener_info.filter[WienerInfo::kHorizontal]);
+ const int height_extra = (height_horizontal - height) >> 1;
+ assert(height_extra <= 2);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* const top = static_cast<const uint8_t*>(top_border);
+ const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
+ const int16_t* const filter_horizontal =
+ restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
+ const __m128i c = LoadLo8(filter_horizontal);
// In order to keep the horizontal pass intermediate values within 16 bits we
// offset |filter[3]| by 128. The 128 offset will be added back in the loop.
const __m128i coefficients_horizontal =
_mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
- WienerHorizontalTap7(src - 3, source_stride, wiener_stride,
- height_horizontal, coefficients_horizontal,
+ WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[0], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
+ filter_horizontal[0], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[0],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
- WienerHorizontalTap5(src - 2, source_stride, wiener_stride,
- height_horizontal, coefficients_horizontal,
+ WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[1], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
+ filter_horizontal[1], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[1],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
// The maximum over-reads happen here.
- WienerHorizontalTap3(src - 1, source_stride, wiener_stride,
- height_horizontal, coefficients_horizontal,
+ WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
+ top_border_stride, wiener_stride, height_extra,
+ filter_horizontal[2], coefficients_horizontal,
&wiener_buffer_horizontal);
+ WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
+ filter_horizontal[2], coefficients_horizontal,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
+ height_extra, filter_horizontal[2],
+ coefficients_horizontal, &wiener_buffer_horizontal);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
- WienerHorizontalTap1(src, source_stride, wiener_stride, height_horizontal,
+ WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
+ top_border_stride, wiener_stride, height_extra,
&wiener_buffer_horizontal);
+ WienerHorizontalTap1(src, stride, wiener_stride, height,
+ &wiener_buffer_horizontal);
+ WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
+ height_extra, &wiener_buffer_horizontal);
}
// vertical filtering.
@@ -563,27 +570,114 @@
// the top and bottom row of |wiener_buffer| accordingly.
memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
sizeof(*wiener_buffer_horizontal) * wiener_stride);
- memcpy(buffer->wiener_buffer, buffer->wiener_buffer + wiener_stride,
- sizeof(*buffer->wiener_buffer) * wiener_stride);
+ memcpy(restoration_buffer->wiener_buffer,
+ restoration_buffer->wiener_buffer + wiener_stride,
+ sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
- filter_vertical, dst, dest_stride);
+ filter_vertical, dst, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
- height, filter_vertical + 1, dst, dest_stride);
+ height, filter_vertical + 1, dst, stride);
} else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
- wiener_stride, height, filter_vertical + 2, dst,
- dest_stride);
+ wiener_stride, height, filter_vertical + 2, dst, stride);
} else {
assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
- wiener_stride, height, dst, dest_stride);
+ wiener_stride, height, dst, stride);
}
}
//------------------------------------------------------------------------------
// SGR
+// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3 for
+// Pass 1 and 2 for Pass 2.
+constexpr int kOverreadInBytesPass1 = 10;
+constexpr int kOverreadInBytesPass2 = 12;
+
+inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+}
+
+inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16(src[0] + x);
+ dst[1] = LoadAligned16(src[1] + x);
+ dst[2] = LoadAligned16(src[2] + x);
+}
+
+inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3]) {
+ dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
+ dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
+ dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
+}
+
+inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
+ dst[0] = LoadAligned16(src + 0);
+ dst[1] = LoadAligned16(src + 4);
+}
+
+inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
+ const ptrdiff_t border, __m128i dst[2]) {
+ dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
+ dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
+}
+
+inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
+ __m128i dst[2][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+}
+
+inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[2][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+}
+
+inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
+ __m128i dst[3][2]) {
+ LoadAligned32U32(src[0] + x, dst[0]);
+ LoadAligned32U32(src[1] + x, dst[1]);
+ LoadAligned32U32(src[2] + x, dst[2]);
+}
+
+inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
+ const ptrdiff_t x, const ptrdiff_t border,
+ __m128i dst[3][2]) {
+ LoadAligned32U32Msan(src[0], x, border, dst[0]);
+ LoadAligned32U32Msan(src[1], x, border, dst[1]);
+ LoadAligned32U32Msan(src[2], x, border, dst[2]);
+}
+
+inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 8, src[1]);
+}
+
+inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
+ StoreAligned16(dst + 0, src[0]);
+ StoreAligned16(dst + 4, src[1]);
+}
+
+inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
+ StoreAligned32U32(dst + 0, src + 0);
+ StoreAligned32U32(dst + 8, src + 2);
+}
+
// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
// functions. Some compilers may generate super inefficient code and the whole
// decoder could be 15% slower.
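
The Msan loader variants above take an extra |border| argument. A hedged sketch of the intended call pattern; the wrapper and its selection logic are assumptions built on the two loaders defined in this hunk:

// Sketch (assumed usage): pick the Msan variant only for the last vector of
// a row, where the 16-byte load may extend past |sum_width| valid entries.
// The |border| argument tells MemorySanitizer builds how many trailing
// bytes of the load are intentionally uninitialized.
inline void LoadSum3(const uint16_t* const src[2], const ptrdiff_t x,
                     const ptrdiff_t sum_width, __m128i dst[2]) {
  if (x + 8 > sum_width) {
    LoadAligned16x2U16Msan(src, x, sum_width, dst);  // annotates over-read
  } else {
    LoadAligned16x2U16(src, x, dst);
  }
}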
@@ -632,24 +726,6 @@
return _mm_add_epi32(src0, s1);
}
-// Using VgetLane16() can save a sign extension instruction.
-template <int n>
-inline int VgetLane16(const __m128i src) {
- return _mm_extract_epi16(src, n);
-}
-
-inline __m128i VmullLo8(const __m128i src0, const __m128i src1) {
- const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
- const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
- return _mm_mullo_epi16(s0, s1);
-}
-
-inline __m128i VmullHi8(const __m128i src0, const __m128i src1) {
- const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
- const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
- return _mm_mullo_epi16(s0, s1);
-}
-
inline __m128i VmullNLo8(const __m128i src0, const int src1) {
const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
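
The hunk above removes VmullLo8()/VmullHi8(); the next one re-expresses pixel squaring as SquareLo8()/SquareHi8(). A scalar sketch of the width reasoning behind that change (illustrative only):

// An 8-bit value squared never exceeds 16 bits, so _mm_mullo_epi16 on
// zero-extended bytes is exact and no 32-bit widening multiply is needed.
#include <cstdint>

static_assert(255 * 255 <= 65535, "8-bit squares fit in 16 bits");

inline uint16_t Square8(const uint8_t v) {
  return static_cast<uint16_t>(static_cast<uint16_t>(v) * v);
}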
@@ -682,131 +758,97 @@
return _mm_srli_epi32(sum, src1);
}
-template <int n>
-inline __m128i CalcAxN(const __m128i a) {
- static_assert(n == 9 || n == 25, "");
- // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
- // Some compilers could do this for us but we make this explicit.
- // return _mm_mullo_epi32(a, _mm_set1_epi32(n));
- const __m128i ax9 = _mm_add_epi32(a, _mm_slli_epi32(a, 3));
- if (n == 9) return ax9;
- if (n == 25) return _mm_add_epi32(ax9, _mm_slli_epi32(a, 4));
+inline __m128i SquareLo8(const __m128i src) {
+ const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
}
-template <int n>
-inline __m128i CalculateMa(const __m128i sum_sq, const __m128i sum,
- const uint32_t s) {
- // a = |sum_sq|
- // d = |sum|
- // p = (a * n < d * d) ? 0 : a * n - d * d;
- const __m128i dxd = _mm_madd_epi16(sum, sum);
- const __m128i axn = CalcAxN<n>(sum_sq);
- const __m128i sub = _mm_sub_epi32(axn, dxd);
- const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
-
- // z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
- const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(s));
- return VrshrU32(pxs, kSgrProjScaleBits);
+inline __m128i SquareHi8(const __m128i src) {
+ const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
+ return _mm_mullo_epi16(s, s);
}
-// b = ma * b * one_over_n
-// |ma| = [0, 255]
-// |sum| is a box sum with radius 1 or 2.
-// For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
-// For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
-// |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
-// When radius is 2 |n| is 25. |one_over_n| is 164.
-// When radius is 1 |n| is 9. |one_over_n| is 455.
-// |kSgrProjReciprocalBits| is 12.
-// Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
-// Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
-inline __m128i CalculateIntermediate4(const __m128i ma, const __m128i sum,
- const uint32_t one_over_n) {
- const __m128i maq = _mm_unpacklo_epi8(ma, _mm_setzero_si128());
- const __m128i s = _mm_unpackhi_epi16(maq, _mm_setzero_si128());
- const __m128i m = _mm_madd_epi16(s, sum);
- const __m128i b = _mm_mullo_epi32(m, _mm_set1_epi32(one_over_n));
- const __m128i truncate_u32 = VrshrU32(b, kSgrProjReciprocalBits);
- return _mm_packus_epi32(truncate_u32, truncate_u32);
+inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
}
-inline __m128i CalculateIntermediate8(const __m128i ma, const __m128i sum,
- const uint32_t one_over_n) {
- const __m128i maq = _mm_unpackhi_epi8(ma, _mm_setzero_si128());
- const __m128i m0 = VmullLo16(maq, sum);
- const __m128i m1 = VmullHi16(maq, sum);
- const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
- const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
- const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
- const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
- return _mm_packus_epi32(b_lo, b_hi);
+template <int offset>
+inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
}
-inline __m128i Sum3_16(const __m128i left, const __m128i middle,
- const __m128i right) {
- const __m128i sum = _mm_add_epi16(left, middle);
- return _mm_add_epi16(sum, right);
+inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
+ dst[0] = src[0];
+ dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
}
-inline __m128i Sum3_32(const __m128i left, const __m128i middle,
- const __m128i right) {
- const __m128i sum = _mm_add_epi32(left, middle);
- return _mm_add_epi32(sum, right);
+inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
+ dst[0] = src;
+ dst[1] = _mm_srli_si128(src, 1);
+ dst[2] = _mm_srli_si128(src, 2);
+ dst[3] = _mm_srli_si128(src, 3);
+ dst[4] = _mm_srli_si128(src, 4);
}
-inline __m128i Sum3W_16(const __m128i left, const __m128i middle,
- const __m128i right) {
- const __m128i sum = VaddlLo8(left, middle);
- return VaddwLo8(sum, right);
+template <int offset>
+inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
+ dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
+ dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
+ dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
}
-inline __m128i Sum3WLo_16(const __m128i src[3]) {
- return Sum3W_16(src[0], src[1], src[2]);
+inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
+ Prepare3_16(src, dst);
+ dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
+ dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
}
-inline __m128i Sum3WHi_16(const __m128i src[3]) {
+inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi16(src0, src1);
+ return _mm_add_epi16(sum, src2);
+}
+
+inline __m128i Sum3_16(const __m128i src[3]) {
+ return Sum3_16(src[0], src[1], src[2]);
+}
+
+inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
+ const __m128i src2) {
+ const __m128i sum = _mm_add_epi32(src0, src1);
+ return _mm_add_epi32(sum, src2);
+}
+
+inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
+ dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
+ dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
+}
+
+inline __m128i Sum3WLo16(const __m128i src[3]) {
+ const __m128i sum = VaddlLo8(src[0], src[1]);
+ return VaddwLo8(sum, src[2]);
+}
+
+inline __m128i Sum3WHi16(const __m128i src[3]) {
const __m128i sum = VaddlHi8(src[0], src[1]);
return VaddwHi8(sum, src[2]);
}
-inline __m128i Sum3WLo_32(const __m128i left, const __m128i middle,
- const __m128i right) {
- const __m128i sum = VaddlLo16(left, middle);
- return VaddwLo16(sum, right);
+inline __m128i Sum3WLo32(const __m128i src[3]) {
+ const __m128i sum = VaddlLo16(src[0], src[1]);
+ return VaddwLo16(sum, src[2]);
}
-inline __m128i Sum3WHi_32(const __m128i left, const __m128i middle,
- const __m128i right) {
- const __m128i sum = VaddlHi16(left, middle);
- return VaddwHi16(sum, right);
-}
-
-inline __m128i* Sum3W_16x2(const __m128i src[3], __m128i sum[2]) {
- sum[0] = Sum3WLo_16(src);
- sum[1] = Sum3WHi_16(src);
- return sum;
-}
-
-inline __m128i* Sum3W(const __m128i src[3], __m128i sum[2]) {
- sum[0] = Sum3WLo_32(src[0], src[1], src[2]);
- sum[1] = Sum3WHi_32(src[0], src[1], src[2]);
- return sum;
-}
-
-template <int index>
-inline __m128i Sum3WLo(const __m128i src[3][2]) {
- return Sum3WLo_32(src[0][index], src[1][index], src[2][index]);
-}
-
-inline __m128i Sum3WHi(const __m128i src[3][2]) {
- return Sum3WHi_32(src[0][0], src[1][0], src[2][0]);
-}
-
-inline __m128i* Sum3W(const __m128i src[3][2], __m128i sum[3]) {
- sum[0] = Sum3WLo<0>(src);
- sum[1] = Sum3WHi(src);
- sum[2] = Sum3WLo<1>(src);
- return sum;
+inline __m128i Sum3WHi32(const __m128i src[3]) {
+ const __m128i sum = VaddlHi16(src[0], src[1]);
+ return VaddwHi16(sum, src[2]);
}
inline __m128i Sum5_16(const __m128i src[5]) {
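
The following hunk reworks the SGR pipeline; as context, a scalar model of the CalculateMa() index math it implements, with kSgrProjScaleBits assumed to be 20 as in libgav1 (a sketch, not the patch's code):

#include <algorithm>
#include <cstdint>

constexpr int kSgrProjScaleBitsModel = 20;  // assumed value

// n is the box size (9 or 25); |scale| comes from the frame header.
uint32_t CalculateMaScalar(const uint32_t sum, const uint32_t sum_sq,
                           const uint32_t n, const uint32_t scale) {
  const uint32_t dxd = sum * sum;   // d * d
  const uint32_t axn = sum_sq * n;  // a * n
  const uint32_t p = (axn < dxd) ? 0 : axn - dxd;
  // z = RightShiftWithRounding(p * scale, kSgrProjScaleBits), capped at 255
  // before indexing kSgrMaLookup.
  const uint64_t z = (static_cast<uint64_t>(p) * scale +
                      (1 << (kSgrProjScaleBitsModel - 1))) >>
                     kSgrProjScaleBitsModel;
  return static_cast<uint32_t>(std::min<uint64_t>(z, 255));
}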
@@ -816,323 +858,993 @@
return _mm_add_epi16(sum, src[4]);
}
-inline __m128i Sum5_32(const __m128i src[5]) {
- const __m128i sum01 = _mm_add_epi32(src[0], src[1]);
- const __m128i sum23 = _mm_add_epi32(src[2], src[3]);
+inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
+ const __m128i* const src2, const __m128i* const src3,
+ const __m128i* const src4) {
+ const __m128i sum01 = _mm_add_epi32(*src0, *src1);
+ const __m128i sum23 = _mm_add_epi32(*src2, *src3);
const __m128i sum = _mm_add_epi32(sum01, sum23);
- return _mm_add_epi32(sum, src[4]);
+ return _mm_add_epi32(sum, *src4);
}
-inline __m128i Sum5WLo_16(const __m128i src[5]) {
+inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
+ dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
+ dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
+}
+
+inline __m128i Sum5WLo16(const __m128i src[5]) {
const __m128i sum01 = VaddlLo8(src[0], src[1]);
const __m128i sum23 = VaddlLo8(src[2], src[3]);
const __m128i sum = _mm_add_epi16(sum01, sum23);
return VaddwLo8(sum, src[4]);
}
-inline __m128i Sum5WHi_16(const __m128i src[5]) {
+inline __m128i Sum5WHi16(const __m128i src[5]) {
const __m128i sum01 = VaddlHi8(src[0], src[1]);
const __m128i sum23 = VaddlHi8(src[2], src[3]);
const __m128i sum = _mm_add_epi16(sum01, sum23);
return VaddwHi8(sum, src[4]);
}
-inline __m128i Sum5WLo_32(const __m128i src[5]) {
- const __m128i sum01 = VaddlLo16(src[0], src[1]);
- const __m128i sum23 = VaddlLo16(src[2], src[3]);
- const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
- return VaddwLo16(sum0123, src[4]);
-}
-
-inline __m128i Sum5WHi_32(const __m128i src[5]) {
- const __m128i sum01 = VaddlHi16(src[0], src[1]);
- const __m128i sum23 = VaddlHi16(src[2], src[3]);
- const __m128i sum0123 = _mm_add_epi32(sum01, sum23);
- return VaddwHi16(sum0123, src[4]);
-}
-
-inline __m128i* Sum5W_16D(const __m128i src[5], __m128i sum[2]) {
- sum[0] = Sum5WLo_16(src);
- sum[1] = Sum5WHi_16(src);
- return sum;
-}
-
-inline __m128i* Sum5W_32x2(const __m128i src[5], __m128i sum[2]) {
- sum[0] = Sum5WLo_32(src);
- sum[1] = Sum5WHi_32(src);
- return sum;
-}
-
-template <int index>
-inline __m128i Sum5WLo(const __m128i src[5][2]) {
- __m128i s[5];
- s[0] = src[0][index];
- s[1] = src[1][index];
- s[2] = src[2][index];
- s[3] = src[3][index];
- s[4] = src[4][index];
- return Sum5WLo_32(s);
-}
-
-inline __m128i Sum5WHi(const __m128i src[5][2]) {
- __m128i s[5];
- s[0] = src[0][0];
- s[1] = src[1][0];
- s[2] = src[2][0];
- s[3] = src[3][0];
- s[4] = src[4][0];
- return Sum5WHi_32(s);
-}
-
-inline __m128i* Sum5W_32x3(const __m128i src[5][2], __m128i sum[3]) {
- sum[0] = Sum5WLo<0>(src);
- sum[1] = Sum5WHi(src);
- sum[2] = Sum5WLo<1>(src);
- return sum;
-}
-
inline __m128i Sum3Horizontal(const __m128i src) {
- const auto left = src;
- const auto middle = _mm_srli_si128(src, 2);
- const auto right = _mm_srli_si128(src, 4);
- return Sum3_16(left, middle, right);
+ __m128i s[3];
+ Prepare3Lo8(src, s);
+ return Sum3WLo16(s);
}
-inline __m128i Sum3Horizontal_32(const __m128i src[2]) {
- const auto left = src[0];
- const auto middle = _mm_alignr_epi8(src[1], src[0], 4);
- const auto right = _mm_alignr_epi8(src[1], src[0], 8);
- return Sum3_32(left, middle, right);
+template <int offset>
+inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_8<offset>(src, s);
+ dst[0] = Sum3WLo16(s);
+ dst[1] = Sum3WHi16(s);
}
-inline __m128i Sum3HorizontalOffset1(const __m128i src) {
- const auto left = _mm_srli_si128(src, 2);
- const auto middle = _mm_srli_si128(src, 4);
- const auto right = _mm_srli_si128(src, 6);
- return Sum3_16(left, middle, right);
-}
-
-inline __m128i Sum3HorizontalOffset1_16(const __m128i src[2]) {
- const auto left = _mm_alignr_epi8(src[1], src[0], 2);
- const auto middle = _mm_alignr_epi8(src[1], src[0], 4);
- const auto right = _mm_alignr_epi8(src[1], src[0], 6);
- return Sum3_16(left, middle, right);
-}
-
-inline __m128i Sum3HorizontalOffset1_32(const __m128i src[2]) {
- const auto left = _mm_alignr_epi8(src[1], src[0], 4);
- const auto middle = _mm_alignr_epi8(src[1], src[0], 8);
- const auto right = _mm_alignr_epi8(src[1], src[0], 12);
- return Sum3_32(left, middle, right);
-}
-
-inline void Sum3HorizontalOffset1_32x2(const __m128i src[3], __m128i sum[2]) {
- sum[0] = Sum3HorizontalOffset1_32(src + 0);
- sum[1] = Sum3HorizontalOffset1_32(src + 1);
+inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum3WLo32(s);
+ dst[1] = Sum3WHi32(s);
}
inline __m128i Sum5Horizontal(const __m128i src) {
__m128i s[5];
- s[0] = src;
- s[1] = _mm_srli_si128(src, 2);
- s[2] = _mm_srli_si128(src, 4);
- s[3] = _mm_srli_si128(src, 6);
- s[4] = _mm_srli_si128(src, 8);
- return Sum5_16(s);
+ Prepare5Lo8(src, s);
+ return Sum5WLo16(s);
}
-inline __m128i Sum5Horizontal_16(const __m128i src[2]) {
+template <int offset>
+inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
+ __m128i* const dst1) {
__m128i s[5];
- s[0] = src[0];
- s[1] = _mm_alignr_epi8(src[1], src[0], 2);
- s[2] = _mm_alignr_epi8(src[1], src[0], 4);
- s[3] = _mm_alignr_epi8(src[1], src[0], 6);
- s[4] = _mm_alignr_epi8(src[1], src[0], 8);
- return Sum5_16(s);
+ Prepare5_8<offset>(src, s);
+ *dst0 = Sum5WLo16(s);
+ *dst1 = Sum5WHi16(s);
}
-inline __m128i Sum5Horizontal_32(const __m128i src[2]) {
+inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
__m128i s[5];
- s[0] = src[0];
- s[1] = _mm_alignr_epi8(src[1], src[0], 4);
- s[2] = _mm_alignr_epi8(src[1], src[0], 8);
- s[3] = _mm_alignr_epi8(src[1], src[0], 12);
- s[4] = src[1];
- return Sum5_32(s);
+ Prepare5_16(src, s);
+ const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
+ const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
+ const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
+ dst[0] = VaddwLo16(sum0123_lo, s[4]);
+ const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
+ const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
+ const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
+ dst[1] = VaddwHi16(sum0123_hi, s[4]);
}
-inline __m128i* Sum5Horizontal_32x2(const __m128i src[3], __m128i sum[2]) {
+void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlLo16(src[0], src[4]);
+ *row_sq3 = Sum3WLo32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
+ __m128i* const row_sq5) {
+ const __m128i sum04 = VaddlHi16(src[0], src[4]);
+ *row_sq3 = Sum3WHi32(src + 1);
+ *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
+}
+
+void SumHorizontalLo(const __m128i src, __m128i* const row3,
+ __m128i* const row5) {
__m128i s[5];
- s[0] = src[0];
- s[1] = _mm_alignr_epi8(src[1], src[0], 4);
- s[2] = _mm_alignr_epi8(src[1], src[0], 8);
- s[3] = _mm_alignr_epi8(src[1], src[0], 12);
- s[4] = src[1];
- sum[0] = Sum5_32(s);
- s[0] = src[1];
- s[1] = _mm_alignr_epi8(src[2], src[1], 4);
- s[2] = _mm_alignr_epi8(src[2], src[1], 8);
- s[3] = _mm_alignr_epi8(src[2], src[1], 12);
- s[4] = src[2];
- sum[1] = Sum5_32(s);
- return sum;
+ Prepare5Lo8(src, s);
+ const __m128i sum04 = VaddlLo8(s[0], s[4]);
+ *row3 = Sum3WLo16(s + 1);
+ *row5 = _mm_add_epi16(sum04, *row3);
}
-template <int size, int offset>
-inline void BoxFilterPreProcess4(const __m128i* const row,
- const __m128i* const row_sq, const uint32_t s,
- uint16_t* const dst) {
- static_assert(size == 3 || size == 5, "");
- static_assert(offset == 0 || offset == 1, "");
- // Number of elements in the box being summed.
- constexpr uint32_t n = size * size;
- constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- __m128i sum, sum_sq;
- if (size == 3) {
- __m128i temp32[2];
- if (offset == 0) {
- sum = Sum3Horizontal(Sum3WLo_16(row));
- sum_sq = Sum3Horizontal_32(Sum3W(row_sq, temp32));
- } else {
- sum = Sum3HorizontalOffset1(Sum3WLo_16(row));
- sum_sq = Sum3HorizontalOffset1_32(Sum3W(row_sq, temp32));
- }
- }
- if (size == 5) {
- __m128i temp[2];
- sum = Sum5Horizontal(Sum5WLo_16(row));
- sum_sq = Sum5Horizontal_32(Sum5W_32x2(row_sq, temp));
- }
- const __m128i sum_32 = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
- const __m128i z0 = CalculateMa<n>(sum_sq, sum_32, s);
- const __m128i z1 = _mm_packus_epi32(z0, z0);
- const __m128i z = _mm_min_epu16(z1, _mm_set1_epi16(255));
- __m128i ma = _mm_setzero_si128();
- ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<0>(z)], 4);
- ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<1>(z)], 5);
- ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<2>(z)], 6);
- ma = _mm_insert_epi8(ma, kSgrMaLookup[VgetLane16<3>(z)], 7);
- const __m128i b = CalculateIntermediate4(ma, sum_32, one_over_n);
- const __m128i ma_b = _mm_unpacklo_epi64(ma, b);
- StoreAligned16(dst, ma_b);
+template <int offset>
+void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
+ __m128i* const row3_1, __m128i* const row5_0,
+ __m128i* const row5_1) {
+ __m128i s[5];
+ Prepare5_8<offset>(src, s);
+ const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
+ const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
+ *row3_0 = Sum3WLo16(s + 1);
+ *row3_1 = Sum3WHi16(s + 1);
+ *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
+ *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
+}
+
+inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
+ __m128i* const row_sq3_1, __m128i* const row_sq5_0,
+ __m128i* const row_sq5_1) {
+ __m128i s[5];
+ Prepare5_16(src, s);
+ SumHorizontalLo(s, row_sq3_0, row_sq5_0);
+ SumHorizontalHi(s, row_sq3_1, row_sq5_1);
+}
+
+inline __m128i Sum343Lo(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WLo16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwLo8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343Hi(const __m128i ma3[3]) {
+ const __m128i sum = Sum3WHi16(ma3);
+ const __m128i sum3 = Sum3_16(sum, sum, sum);
+ return VaddwHi8(sum3, ma3[1]);
+}
+
+inline __m128i Sum343WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwLo16(sum3, src[1]);
+}
+
+inline __m128i Sum343WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum3 = Sum3_32(sum, sum, sum);
+ return VaddwHi16(sum3, src[1]);
+}
+
+inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum343WLo(s);
+ dst[1] = Sum343WHi(s);
+}
+
+inline __m128i Sum565Lo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwLo8(sum5, src[1]);
+}
+
+inline __m128i Sum565Hi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi16(src);
+ const __m128i sum4 = _mm_slli_epi16(sum, 2);
+ const __m128i sum5 = _mm_add_epi16(sum4, sum);
+ return VaddwHi8(sum5, src[1]);
+}
+
+inline __m128i Sum565WLo(const __m128i src[3]) {
+ const __m128i sum = Sum3WLo32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwLo16(sum5, src[1]);
+}
+
+inline __m128i Sum565WHi(const __m128i src[3]) {
+ const __m128i sum = Sum3WHi32(src);
+ const __m128i sum4 = _mm_slli_epi32(sum, 2);
+ const __m128i sum5 = _mm_add_epi32(sum4, sum);
+ return VaddwHi16(sum5, src[1]);
+}
+
+inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
+ __m128i s[3];
+ Prepare3_16(src, s);
+ dst[0] = Sum565WLo(s);
+ dst[1] = Sum565WHi(s);
+}
+
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
+ uint32_t* square_sum3, uint32_t* square_sum5) {
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src,
+ sum_width - x + kOverreadInBytesPass1 - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
+ StoreAligned32U16(sum3, row3);
+ StoreAligned32U16(sum5, row5);
+ SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 0, row_sq3);
+ StoreAligned32U32(square_sum5 + 0, row_sq5);
+ SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
+ StoreAligned32U32(square_sum3 + 8, row_sq3);
+ StoreAligned32U32(square_sum5 + 8, row_sq5);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sum3 += 16;
+ sum5 += 16;
+ square_sum3 += 16;
+ square_sum5 += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sum3 += sum_stride - sum_width;
+ sum5 += sum_stride - sum_width;
+ square_sum3 += sum_stride - sum_width;
+ square_sum5 += sum_stride - sum_width;
+ } while (--y != 0);
}
template <int size>
-inline void BoxFilterPreProcess8(const __m128i* const row,
- const __m128i row_sq[][2], const uint32_t s,
- __m128i* const ma, __m128i* const b,
- uint16_t* const dst) {
+inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
+ const ptrdiff_t width, const ptrdiff_t sum_stride,
+ const ptrdiff_t sum_width, uint16_t* sums,
+ uint32_t* square_sums) {
static_assert(size == 3 || size == 5, "");
- // Number of elements in the box being summed.
- constexpr uint32_t n = size * size;
- constexpr uint32_t one_over_n =
- ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
- __m128i sum, sum_sq[2];
- if (size == 3) {
- __m128i temp16[2], temp32[3];
- sum = Sum3HorizontalOffset1_16(Sum3W_16x2(row, temp16));
- Sum3HorizontalOffset1_32x2(Sum3W(row_sq, temp32), sum_sq);
- }
- if (size == 5) {
- __m128i temp16[2], temp32[3];
- sum = Sum5Horizontal_16(Sum5W_16D(row, temp16));
- Sum5Horizontal_32x2(Sum5W_32x3(row_sq, temp32), sum_sq);
- }
+ constexpr int kOverreadInBytes =
+ (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
+ int y = 2;
+ do {
+ __m128i s[2], sq[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
+ sq[0] = SquareLo8(s[0]);
+ ptrdiff_t x = sum_width;
+ do {
+ __m128i row[2], row_sq[4];
+ x -= 16;
+ src += 16;
+ s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
+ sq[1] = SquareHi8(s[0]);
+ sq[2] = SquareLo8(s[1]);
+ if (size == 3) {
+ Sum3Horizontal<0>(s, row);
+ Sum3WHorizontal(sq + 0, row_sq + 0);
+ Sum3WHorizontal(sq + 1, row_sq + 2);
+ } else {
+ Sum5Horizontal<0>(s, &row[0], &row[1]);
+ Sum5WHorizontal(sq + 0, row_sq + 0);
+ Sum5WHorizontal(sq + 1, row_sq + 2);
+ }
+ StoreAligned32U16(sums, row);
+ StoreAligned64U32(square_sums, row_sq);
+ s[0] = s[1];
+ sq[0] = sq[2];
+ sums += 16;
+ square_sums += 16;
+ } while (x != 0);
+ src += src_stride - sum_width;
+ sums += sum_stride - sum_width;
+ square_sums += sum_stride - sum_width;
+ } while (--y != 0);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
+ // a = |sum_sq|
+ // d = |sum|
+ // p = (a * n < d * d) ? 0 : a * n - d * d;
+ const __m128i dxd = _mm_madd_epi16(sum, sum);
+ // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
+ // Some compilers could do this for us but we make this explicit.
+ // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
+ __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
+ if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
+ const __m128i sub = _mm_sub_epi32(axn, dxd);
+ const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
+ const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
+ return VrshrU32(pxs, kSgrProjScaleBits);
+}
+
+template <int n>
+inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
+ const uint32_t scale) {
+ static_assert(n == 9 || n == 25, "");
const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
- const __m128i z0 = CalculateMa<n>(sum_sq[0], sum_lo, s);
const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
- const __m128i z1 = CalculateMa<n>(sum_sq[1], sum_hi, s);
- const __m128i z01 = _mm_packus_epi32(z0, z1);
- const __m128i z = _mm_min_epu16(z01, _mm_set1_epi16(255));
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<0>(z)], 8);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<1>(z)], 9);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<2>(z)], 10);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<3>(z)], 11);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<4>(z)], 12);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<5>(z)], 13);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<6>(z)], 14);
- *ma = _mm_insert_epi8(*ma, kSgrMaLookup[VgetLane16<7>(z)], 15);
- *b = CalculateIntermediate8(*ma, sum, one_over_n);
- const __m128i ma_b = _mm_unpackhi_epi64(*ma, *b);
- StoreAligned16(dst, ma_b);
+ const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
+ const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
+ return _mm_packus_epi32(z0, z1);
}
-inline void Prepare3_8(const __m128i src, __m128i* const left,
- __m128i* const middle, __m128i* const right) {
- *left = _mm_srli_si128(src, 5);
- *middle = _mm_srli_si128(src, 6);
- *right = _mm_srli_si128(src, 7);
+inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
+ // one_over_n == 164.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
+ // one_over_n_quarter == 41.
+ constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
+ static_assert(one_over_n == one_over_n_quarter << 2, "");
+ // |ma| is in range [0, 255].
+ const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
+ const __m128i m0 = VmullLo16(m, sum);
+ const __m128i m1 = VmullHi16(m, sum);
+ const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
+ const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
+ return _mm_packus_epi32(b_lo, b_hi);
}
-inline void Prepare3_16(const __m128i src[2], __m128i* const left,
- __m128i* const middle, __m128i* const right) {
- *left = _mm_alignr_epi8(src[1], src[0], 10);
- *middle = _mm_alignr_epi8(src[1], src[0], 12);
- *right = _mm_alignr_epi8(src[1], src[0], 14);
+inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
+ // one_over_n == 455.
+ constexpr uint32_t one_over_n =
+ ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
+ const __m128i m0 = VmullLo16(ma, sum);
+ const __m128i m1 = VmullHi16(ma, sum);
+ const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
+ const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
+ const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
+ const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
+ return _mm_packus_epi32(b_lo, b_hi);
}
-inline __m128i Sum343(const __m128i src) {
- __m128i left, middle, right;
- Prepare3_8(src, &left, &middle, &right);
- const auto sum = Sum3W_16(left, middle, right);
- const auto sum3 = Sum3_16(sum, sum, sum);
- return VaddwLo8(sum3, middle);
+inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum5_16(s5);
+ Sum5_32(sq5, sum_sq);
+ *index = CalculateMa<25>(*sum, sum_sq, scale);
}
-inline void Sum343_444(const __m128i src, __m128i* const sum343,
- __m128i* const sum444) {
- __m128i left, middle, right;
- Prepare3_8(src, &left, &middle, &right);
- const auto sum111 = Sum3W_16(left, middle, right);
- *sum444 = _mm_slli_epi16(sum111, 2);
- const __m128i sum333 = _mm_sub_epi16(*sum444, sum111);
- *sum343 = VaddwLo8(sum333, middle);
+inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const sum,
+ __m128i* const index) {
+ __m128i sum_sq[2];
+ *sum = Sum3_16(s3);
+ Sum3_32(sq3, sum_sq);
+ *index = CalculateMa<9>(*sum, sum_sq, scale);
}
-inline __m128i* Sum343W(const __m128i src[2], __m128i d[2]) {
- __m128i left, middle, right;
- Prepare3_16(src, &left, &middle, &right);
- d[0] = Sum3WLo_32(left, middle, right);
- d[1] = Sum3WHi_32(left, middle, right);
- d[0] = Sum3_32(d[0], d[0], d[0]);
- d[1] = Sum3_32(d[1], d[1], d[1]);
- d[0] = VaddwLo16(d[0], middle);
- d[1] = VaddwHi16(d[1], middle);
- return d;
+template <int n, int offset>
+inline void LookupIntermediate(const __m128i sum, const __m128i index,
+ __m128i* const ma, __m128i* const b) {
+ static_assert(n == 9 || n == 25, "");
+ static_assert(offset == 0 || offset == 8, "");
+ const __m128i idx = _mm_packus_epi16(index, index);
+ // The store and load are not actually emitted; the compiler processes the
+ // values in a 64-bit general-purpose register, which is faster than using
+ // _mm_extract_epi8().
+ uint8_t temp[8];
+ StoreLo8(temp, idx);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
+ *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ __m128i maq;
+ if (offset == 0) {
+ maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ } else {
+ maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ }
+ *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
}
-inline void Sum343_444W(const __m128i src[2], __m128i sum343[2],
- __m128i sum444[2]) {
- __m128i left, middle, right, sum111[2];
- Prepare3_16(src, &left, &middle, &right);
- sum111[0] = Sum3WLo_32(left, middle, right);
- sum111[1] = Sum3WHi_32(left, middle, right);
- sum444[0] = _mm_slli_epi32(sum111[0], 2);
- sum444[1] = _mm_slli_epi32(sum111[1], 2);
- sum343[0] = _mm_sub_epi32(sum444[0], sum111[0]);
- sum343[1] = _mm_sub_epi32(sum444[1], sum111[1]);
- sum343[0] = VaddwLo16(sum343[0], middle);
- sum343[1] = VaddwHi16(sum343[1], middle);
+// Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
+// to get value 0 as the shuffle result. The most significant bit 1 comes
+// either from the comparison instruction or from the sign bit of the index.
+inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
+ __m128i mask;
+ mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
+ mask = _mm_or_si128(mask, index);
+ return _mm_shuffle_epi8(table, mask);
}
-inline __m128i Sum565(const __m128i src) {
- __m128i left, middle, right;
- Prepare3_8(src, &left, &middle, &right);
- const auto sum = Sum3W_16(left, middle, right);
- const auto sum4 = _mm_slli_epi16(sum, 2);
- const auto sum5 = _mm_add_epi16(sum4, sum);
- return VaddwLo8(sum5, middle);
+inline __m128i AdjustValue(const __m128i value, const __m128i index,
+ const int threshold) {
+ const __m128i thresholds = _mm_set1_epi8(threshold - 128);
+ const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
+ return _mm_add_epi8(value, offset);
}
-inline __m128i Sum565W(const __m128i src) {
- const auto left = _mm_srli_si128(src, 2);
- const auto middle = _mm_srli_si128(src, 4);
- const auto right = _mm_srli_si128(src, 6);
- const auto sum = Sum3WLo_32(left, middle, right);
- const auto sum4 = _mm_slli_epi32(sum, 2);
- const auto sum5 = _mm_add_epi32(sum4, sum);
- return VaddwLo16(sum5, middle);
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i* const ma, __m128i* const b0,
+ __m128i* const b1) {
+ // Use table lookup to read elements whose indices are less than 48.
+ const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
+ const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
+ const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
+ const __m128i indices = _mm_packus_epi16(index[0], index[1]);
+ __m128i idx;
+ // Clip idx to 127 to apply signed comparison instructions.
+ idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
+ // All elements whose indices are larger than 47 are set to 0.
+ // Get shuffle results for indices in range [0, 15].
+ *ma = ShuffleIndex(c0, idx);
+ // Get shuffle results for indices in range [16, 31].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res1 = ShuffleIndex(c1, idx);
+ // Use OR instruction to combine shuffle results together.
+ *ma = _mm_or_si128(*ma, res1);
+ // Get shuffle results for indices in range [32, 47].
+ // Subtract 16 to utilize the sign bit of the index.
+ idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
+ const __m128i res2 = ShuffleIndex(c2, idx);
+ *ma = _mm_or_si128(*ma, res2);
+
+ // For elements whose indices are larger than 47, the table values change
+ // only rarely as the index increases, so we use comparison and arithmetic
+ // operations to calculate them.
+ // Add -128 to apply signed comparison instructions.
+ idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
+ // Elements whose indices are larger than 47 (with value 0) are set to 5.
+ *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
+ *ma = AdjustValue(*ma, idx, 55);   // 55 is the last index whose value is 5.
+ *ma = AdjustValue(*ma, idx, 72);   // 72 is the last index whose value is 4.
+ *ma = AdjustValue(*ma, idx, 101);  // 101 is the last index whose value is 3.
+ *ma = AdjustValue(*ma, idx, 169);  // 169 is the last index whose value is 2.
+ *ma = AdjustValue(*ma, idx, 254);  // 254 is the last index whose value is 1.
+
+ // b = ma * b * one_over_n
+ // |ma| = [0, 255]
+ // |sum| is a box sum with radius 1 or 2.
+ // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
+ // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
+ // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
+ // When radius is 2 |n| is 25. |one_over_n| is 164.
+ // When radius is 1 |n| is 9. |one_over_n| is 455.
+ // |kSgrProjReciprocalBits| is 12.
+ // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
+ // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
+ const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
+ *b0 = CalculateB3(sum[0], maq0);
+ const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
+ *b1 = CalculateB3(sum[1], maq1);
+}
+
+inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
+ __m128i ma[2], __m128i b[2]) {
+ __m128i mas;
+ CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
+ ma[0] = _mm_unpacklo_epi64(ma[0], mas);
+ ma[1] = _mm_srli_si128(mas, 8);
+}
+
+// Note: calling CalculateIntermediate() in place of the slower
+// LookupIntermediate() when calculating 16 intermediate data points has been
+// tried, but the compiler generates even slower code.
+template <int offset>
+inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ static_assert(offset == 0 || offset == 8, "");
+ __m128i sum, index;
+ CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
+ LookupIntermediate<25, offset>(sum, index, ma, b);
+}
+
+inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
+ const uint32_t scale, __m128i* const ma,
+ __m128i* const b) {
+ __m128i sum, index;
+ CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
+ LookupIntermediate<9, 0>(sum, index, ma, b);
+}
+
+inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
+ __m128i sum_b343[2], __m128i sum_b444[2],
+ uint32_t* const b343, uint32_t* const b444) {
+ __m128i b[3], sum_b111[2];
+ Prepare3_16(b3, b);
+ sum_b111[0] = Sum3WLo32(b);
+ sum_b111[1] = Sum3WHi32(b);
+ sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
+ sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
+ StoreAligned32U32(b444 + x, sum_b444);
+ sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
+ sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
+ sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
+ sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
+ StoreAligned32U32(b343 + x, sum_b343);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WLo16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwLo8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i* const sum_ma444, __m128i sum_b343[2],
+ __m128i sum_b444[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ const __m128i sum_ma111 = Sum3WHi16(ma3);
+ *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
+ StoreAligned16(ma444 + x, *sum_ma444);
+ const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
+ *sum_ma343 = VaddwHi8(sum333, ma3[1]);
+ StoreAligned16(ma343 + x, *sum_ma343);
+ Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, __m128i* const sum_ma343,
+ __m128i sum_b343[2], uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma444, sum_b444[2];
+ Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
+ ma444, b343, b444);
+}
+
+inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
+ const ptrdiff_t x, uint16_t* const ma343,
+ uint16_t* const ma444, uint32_t* const b343,
+ uint32_t* const b444) {
+ __m128i sum_ma343, sum_b343[2];
+ Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
+ const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ s5[0][3] = Sum5Horizontal(s[0][0]);
+ StoreAligned16(sum5[3], s5[0][3]);
+ s5[0][4] = Sum5Horizontal(s[1][0]);
+ StoreAligned16(sum5[4], s5[0][4]);
+ Sum5WHorizontal(sq[0], sq5[3]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ Sum5WHorizontal(sq[1], sq5[4]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x3U16(sum5, 0, s5[0]);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
+ const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ Sum5WHorizontal(sq[0] + 1, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ Sum5WHorizontal(sq[1] + 1, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ Sum5WHorizontal(sq[0] + 2, sq5[3]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ Sum5WHorizontal(sq[1] + 2, sq5[4]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
+ const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s5[5], sq5[5][2];
+ sq[1] = SquareHi8(s);
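+ // On the last row the bottom row of the 5x5 window is unavailable, so the
+ // row 3 sums are reused for row 4.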
+ s5[3] = s5[4] = Sum5Horizontal(s);
+ Sum5WHorizontal(sq, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint32_t scale, const uint16_t* const sum5[5],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s5[2][5], sq5[5][2];
+ sq[2] = SquareLo8(s[1]);
+ Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
+ s5[0][4] = s5[0][3];
+ s5[1][4] = s5[1][3];
+ Sum5WHorizontal(sq + 1, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum5WHorizontal(sq + 2, sq5[3]);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
+ const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
+ __m128i* const b) {
+ __m128i s3[3], sq3[3][2];
+ sq[1] = SquareHi8(s);
+ s3[2] = Sum3Horizontal(s);
+ StoreAligned16(sum3[2], s3[2]);
+ Sum3WHorizontal(sq, sq3[2]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scale, ma, b);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
+ const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
+ const uint32_t scale, uint16_t* const sum3[3],
+ uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
+ __m128i b[3]) {
+ __m128i s3[4], sq3[3][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ Sum3Horizontal<8>(s, s3 + 2);
+ StoreAligned32U16(sum3[2] + x, s3 + 2);
+ Sum3WHorizontal(sq + 1, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
+ LoadAligned16x2U16(sum3, x, s3);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ Sum3WHorizontal(sq + 2, sq3[2]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma, b + 1);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
+ const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
+ uint16_t* const sum5[5], uint32_t* const square_sum3[4],
+ uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
+ __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
+ sq[0][1] = SquareHi8(s[0][0]);
+ sq[1][1] = SquareHi8(s[1][0]);
+ SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
+ SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
+ StoreAligned16(sum3[2], s3[2]);
+ StoreAligned16(sum3[3], s3[3]);
+ StoreAligned16(sum5[3], s5[3]);
+ StoreAligned16(sum5[4], s5[4]);
+ SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2], sq3[2]);
+ StoreAligned32U32(square_sum5[3], sq5[3]);
+ SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3], sq3[3]);
+ StoreAligned32U32(square_sum5[4], sq5[4]);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ LoadAligned16x3U16(sum5, 0, s5);
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
+ CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
+ ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
+ const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
+ __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
+ __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
+ SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ StoreAligned16(sum3[2] + x + 0, s3[0][2]);
+ StoreAligned16(sum3[2] + x + 8, s3[1][2]);
+ StoreAligned16(sum5[3] + x + 0, s5[0][3]);
+ StoreAligned16(sum5[3] + x + 8, s5[1][3]);
+ SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
+ StoreAligned16(sum3[3] + x + 0, s3[0][3]);
+ StoreAligned16(sum3[3] + x + 8, s3[1][3]);
+ StoreAligned16(sum5[4] + x + 0, s5[0][4]);
+ StoreAligned16(sum5[4] + x + 8, s5[1][4]);
+ sq[0][2] = SquareLo8(s[0][1]);
+ sq[1][2] = SquareLo8(s[1][1]);
+ SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x, sq5[3]);
+ SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x, sq5[4]);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
+ CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
+ &index[1][0]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
+
+ sq[0][3] = SquareHi8(s[0][1]);
+ sq[1][3] = SquareHi8(s[1][1]);
+ SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
+ StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
+ SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
+ StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
+ StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
+ CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
+ &index[1][1]);
+ CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
+ CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
+ const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
+ __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
+ __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
+ sq[1] = SquareHi8(s);
+ SumHorizontalLo(s, &s3[2], &s5[3]);
+ SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, 0, s5);
+ s5[4] = s5[3];
+ LoadAligned32x3U32(square_sum5, 0, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
+ LoadAligned16x2U16(sum3, 0, s3);
+ LoadAligned32x2U32(square_sum3, 0, sq3);
+ CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
+ const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
+ const uint16_t scales[2], const uint16_t* const sum3[4],
+ const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
+ const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
+ __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
+ __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
+ sq[2] = SquareLo8(s[1]);
+ SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
+ SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16(sum5, x, s5[0]);
+ s5[0][4] = s5[0][3];
+ LoadAligned32x3U32(square_sum5, x, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
+ LoadAligned16x2U16(sum3, x, s3[0]);
+ LoadAligned32x2U32(square_sum3, x, sq3);
+ CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
+
+ sq[3] = SquareHi8(s[1]);
+ SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
+ LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
+ s5[1][4] = s5[1][3];
+ LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
+ sq5[4][0] = sq5[3][0];
+ sq5[4][1] = sq5[3][1];
+ CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
+ LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
+ LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
+ CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
+ CalculateIntermediate(sum, index, ma3, b3 + 1);
+}
+
+inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
+ const uint8_t* const src1, const int width,
+ const uint32_t scale,
+ uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* ma565,
+ uint32_t* b565) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
+
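+ // The loop advances 16 pixels per iteration: the next 16 source bytes are
+ // loaded ahead of time, and the trailing |s|, |sq|, |mas| and |bs| vectors
+ // are carried over to seed the following iteration.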
+ int x = 0;
+ do {
+ __m128i ma5[3], ma[2], b[4];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[0] = Sum565Lo(ma5);
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned32U16(ma565, ma);
+ Sum565W(bs + 0, b + 0);
+ Sum565W(bs + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+template <bool calculate444>
+LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
+ const uint8_t* const src, const int width, const uint32_t scale,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
+ uint32_t* b444) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
+ Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
+ Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
+ ma444 += 16;
+ b444 += 16;
+ } else {
+ __m128i ma[2], b[4];
+ ma[0] = Sum343Lo(ma3);
+ ma[1] = Sum343Hi(ma3);
+ StoreAligned32U16(ma343, ma);
+ Sum343W(bs + 0, b + 0);
+ Sum343W(bs + 1, b + 2);
+ StoreAligned64U32(b343, b);
+ }
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma343 += 16;
+ b343 += 16;
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxSumFilterPreProcess(
+ const uint8_t* const src0, const uint8_t* const src1, const int width,
+ const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
+ uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
+ uint32_t* b565) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[2], b[4], ma3x[3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+
+ Prepare3_8<0>(ma3[0], ma3x);
+ ma[0] = Sum343Lo(ma3x);
+ ma[1] = Sum343Hi(ma3x);
+ StoreAligned32U16(ma343[0] + x, ma);
+ Sum343W(b3[0] + 0, b + 0);
+ Sum343W(b3[0] + 1, b + 2);
+ StoreAligned64U32(b343[0] + x, b);
+ Sum565W(b5 + 0, b + 0);
+ Sum565W(b5 + 1, b + 2);
+ StoreAligned64U32(b565, b);
+ Prepare3_8<0>(ma3[1], ma3x);
+ Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
+ Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[0] = Sum565Lo(ma5x);
+ ma[1] = Sum565Hi(ma5x);
+ StoreAligned32U16(ma565, ma);
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
+ } while (x < width);
}
template <int shift>
@@ -1149,635 +1861,699 @@
}
template <int shift>
-inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i a,
+inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
const __m128i b[2]) {
- const __m128i src_u16 = _mm_unpacklo_epi8(src, _mm_setzero_si128());
- const __m128i ma_x_src_lo = VmullLo16(a, src_u16);
- const __m128i ma_x_src_hi = VmullHi16(a, src_u16);
+ const __m128i ma_x_src_lo = VmullLo16(ma, src);
+ const __m128i ma_x_src_hi = VmullHi16(ma, src);
const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
}
-inline __m128i BoxFilterPass1(const __m128i src_u8, const __m128i ma,
- const __m128i b[2], __m128i ma565[2],
- __m128i b565[2][2]) {
+inline __m128i CalculateFilteredOutputPass1(const __m128i src,
+ const __m128i ma[2],
+ const __m128i b[2][2]) {
+ const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
__m128i b_sum[2];
- ma565[1] = Sum565(ma);
- b565[1][0] = Sum565W(_mm_alignr_epi8(b[1], b[0], 8));
- b565[1][1] = Sum565W(b[1]);
- __m128i ma_sum = _mm_add_epi16(ma565[0], ma565[1]);
- b_sum[0] = _mm_add_epi32(b565[0][0], b565[1][0]);
- b_sum[1] = _mm_add_epi32(b565[0][1], b565[1][1]);
- return CalculateFilteredOutput<5>(src_u8, ma_sum, b_sum); // 13 bits
+ b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
+ b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
}
-inline __m128i BoxFilterPass2(const __m128i src_u8, const __m128i ma,
- const __m128i b[2], __m128i ma343[4],
- __m128i ma444[3], __m128i b343[4][2],
- __m128i b444[3][2]) {
+inline __m128i CalculateFilteredOutputPass2(const __m128i src,
+ const __m128i ma[3],
+ const __m128i b[3][2]) {
+ const __m128i ma_sum = Sum3_16(ma);
__m128i b_sum[2];
- Sum343_444(ma, &ma343[2], &ma444[1]);
- __m128i ma_sum = Sum3_16(ma343[0], ma444[0], ma343[2]);
- Sum343_444W(b, b343[2], b444[1]);
- b_sum[0] = Sum3_32(b343[0][0], b444[0][0], b343[2][0]);
- b_sum[1] = Sum3_32(b343[0][1], b444[0][1], b343[2][1]);
- return CalculateFilteredOutput<5>(src_u8, ma_sum, b_sum); // 13 bits
+ Sum3_32(b, b_sum);
+ return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
}
-inline void SelfGuidedFinal(const __m128i src, const __m128i v[2],
- uint8_t* const dst) {
+inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
const __m128i v_lo =
VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const __m128i v_hi =
VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
- const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
- const __m128i d = _mm_add_epi16(s, vv);
- StoreLo8(dst, _mm_packus_epi16(d, d));
+ return _mm_add_epi16(src, vv);
}
-inline void SelfGuidedDoubleMultiplier(const __m128i src,
- const __m128i filter[2], const int w0,
- const int w2, uint8_t* const dst) {
+inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
+ const __m128i filter[2], const int w0,
+ const int w2) {
__m128i v[2];
const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
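+ // Each 32-bit lane of |w0_w2| holds w0 in its low half and w2 in its high
+ // half, so the _mm_madd_epi16() below yields w0 * filter[0] + w2 * filter[1]
+ // per pixel.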
const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
v[0] = _mm_madd_epi16(w0_w2, f_lo);
v[1] = _mm_madd_epi16(w0_w2, f_hi);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void SelfGuidedSingleMultiplier(const __m128i src, const __m128i filter,
- const int w0, uint8_t* const dst) {
+inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
+ const __m128i filter, const int w0) {
// weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
__m128i v[2];
v[0] = VmullNLo8(filter, w0);
v[1] = VmullNHi8(filter, w0);
- SelfGuidedFinal(src, v, dst);
+ return SelfGuidedFinal(src, v);
}
-inline void BoxFilterProcess(const uint8_t* const src,
- const ptrdiff_t src_stride,
- const RestorationUnitInfo& restoration_info,
- const int width, const int height,
- const uint16_t scale[2], uint16_t* const temp,
- uint8_t* const dst, const ptrdiff_t dst_stride) {
- // We have combined PreProcess and Process for the first pass by storing
- // intermediate values in the |ma| region. The values stored are one
- // vertical column of interleaved |ma| and |b| values and consume 8 *
- // |height| values. This is |height| and not |height| * 2 because PreProcess
- // only generates output for every other row. When processing the next column
- // we write the new scratch values right after reading the previously saved
- // ones.
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
+ uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
+ const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
+ uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], mas[2], sq[2][4], bs[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
- // The PreProcess phase calculates a 5x5 box sum for every other row
- //
- // PreProcess and Process have been combined into the same step. We need 12
- // input values to generate 8 output values for PreProcess:
- // 0 1 2 3 4 5 6 7 8 9 10 11
- // 2 = 0 + 1 + 2 + 3 + 4
- // 3 = 1 + 2 + 3 + 4 + 5
- // 4 = 2 + 3 + 4 + 5 + 6
- // 5 = 3 + 4 + 5 + 6 + 7
- // 6 = 4 + 5 + 6 + 7 + 8
- // 7 = 5 + 6 + 7 + 8 + 9
- // 8 = 6 + 7 + 8 + 9 + 10
- // 9 = 7 + 8 + 9 + 10 + 11
- //
- // and then we need 10 input values to generate 8 output values for Process:
- // 0 1 2 3 4 5 6 7 8 9
- // 1 = 0 + 1 + 2
- // 2 = 1 + 2 + 3
- // 3 = 2 + 3 + 4
- // 4 = 3 + 4 + 5
- // 5 = 4 + 5 + 6
- // 6 = 5 + 6 + 7
- // 7 = 6 + 7 + 8
- // 8 = 7 + 8 + 9
- //
- // To avoid re-calculating PreProcess values over and over again we will do a
- // single column of 8 output values and store the second half of them
- // interleaved in |temp|. The first half is not stored, since it is used
- // immediately and becomes useless for the next column. Next we will start the
- // second column. When 2 rows have been calculated we can calculate Process
- // and output the results.
-
- // Calculate and store a single column. Scope so we can re-use the variable
- // names for the next step.
- uint16_t* ab_ptr = temp;
- const uint8_t* const src_pre_process = src - 2 * src_stride;
- // Calculate intermediate results, including two-pixel border, for example, if
- // unit size is 64x64, we calculate 68x68 pixels.
- {
- const uint8_t* column = src_pre_process - 4;
- __m128i row[5], row_sq[5];
- row[0] = row[1] = LoadLo8(column);
- column += src_stride;
- row[2] = LoadLo8(column);
- row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
- row_sq[2] = VmullLo8(row[2], row[2]);
-
- int y = (height + 2) >> 1;
- do {
- column += src_stride;
- row[3] = LoadLo8(column);
- column += src_stride;
- row[4] = LoadLo8(column);
- row_sq[3] = VmullLo8(row[3], row[3]);
- row_sq[4] = VmullLo8(row[4], row[4]);
- BoxFilterPreProcess4<5, 1>(row + 0, row_sq + 0, scale[0], ab_ptr + 0);
- BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, scale[1], ab_ptr + 8);
- BoxFilterPreProcess4<3, 1>(row + 2, row_sq + 2, scale[1], ab_ptr + 16);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0] = row_sq[2];
- row_sq[1] = row_sq[3];
- row_sq[2] = row_sq[4];
- ab_ptr += 24;
- } while (--y != 0);
-
- if ((height & 1) != 0) {
- column += src_stride;
- row[3] = row[4] = LoadLo8(column);
- row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
- BoxFilterPreProcess4<5, 1>(row + 0, row_sq + 0, scale[0], ab_ptr + 0);
- BoxFilterPreProcess4<3, 1>(row + 1, row_sq + 1, scale[1], ab_ptr + 8);
- }
- }
-
- const int w0 = restoration_info.sgr_proj_info.multiplier[0];
- const int w1 = restoration_info.sgr_proj_info.multiplier[1];
- const int w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
int x = 0;
do {
- // |src_pre_process| is X but we already processed the first column of 4
- // values so we want to start at Y and increment from there.
- // X s s s Y s s
- // s s s s s s s
- // s s i i i i i
- // s s i o o o o
- // s s i o o o o
+ __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
+ bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ StoreAligned16(ma565[1] + x, ma[1]);
+ Sum565W(bs, b[1]);
+ StoreAligned32U32(b565[1] + x, b[1]);
+ sr[0] = LoadAligned16(src + x);
+ sr[1] = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
+ const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
- // Seed the loop with one line of output. Then, inside the loop, for each
- // iteration we can output one even row and one odd row and carry the new
- // line to the next iteration. In the diagram below 'i' values are
- // intermediary values from the first step and '-' values are empty.
- // iiii
- // ---- > even row
- // iiii - odd row
- // ---- > even row
- // iiii
- __m128i ma[2], b[2][2], ma565[2], ma343[4], ma444[3];
- __m128i b565[2][2], b343[4][2], b444[3][2];
- ab_ptr = temp;
- ma[0] = b[0][0] = LoadAligned16(ab_ptr);
- ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
- const uint8_t* column = src_pre_process + x;
- __m128i row[5], row_sq[5][2];
- // Need |width| + 3 pixels, but we read max(|x|) + 16 pixels.
- // Mask max(|x|) + 13 - |width| extra pixels.
- row[0] = row[1] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[2] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[3] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
- row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
- row_sq[2][0] = VmullLo8(row[2], row[2]);
- row_sq[2][1] = VmullHi8(row[2], row[2]);
- row_sq[3][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = VmullHi8(row[3], row[3]);
- row_sq[4][0] = VmullLo8(row[4], row[4]);
- row_sq[4][1] = VmullHi8(row[4], row[4]);
- BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
- BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
- ab_ptr + 8);
-
- // Pass 1 Process. These are the only values we need to propagate between
- // rows.
- ma565[0] = Sum565(ma[0]);
- b565[0][0] = Sum565W(_mm_alignr_epi8(b[0][1], b[0][0], 8));
- b565[0][1] = Sum565W(b[0][1]);
- ma343[0] = Sum343(ma[1]);
- Sum343W(b[1], b343[0]);
- ma[1] = b[1][0] = LoadAligned16(ab_ptr + 16);
- BoxFilterPreProcess8<3>(row + 2, row_sq + 2, scale[1], &ma[1], &b[1][1],
- ab_ptr + 16);
- Sum343_444(ma[1], &ma343[1], &ma444[0]);
- Sum343_444W(b[1], b343[1], b444[0]);
-
- uint8_t* dst_ptr = dst + x;
- // Calculate one output line. Add in the line from the previous pass and
- // output one even row. Sum the new line and output the odd row. Carry the
- // new row into the next pass.
- for (int y = height >> 1; y != 0; --y) {
- ab_ptr += 24;
- ma[0] = b[0][0] = LoadAligned16(ab_ptr);
- ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
- row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
- row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
- column += src_stride;
- row[3] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[3][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = VmullHi8(row[3], row[3]);
- row_sq[4][0] = VmullLo8(row[4], row[4]);
- row_sq[4][1] = VmullHi8(row[4], row[4]);
- BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
- BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
- ab_ptr + 8);
- __m128i p[2];
- p[0] = BoxFilterPass1(row[1], ma[0], b[0], ma565, b565);
- p[1] = BoxFilterPass2(row[1], ma[1], b[1], ma343, ma444, b343, b444);
- SelfGuidedDoubleMultiplier(row[1], p, w0, w2, dst_ptr);
- dst_ptr += dst_stride;
- p[0] = CalculateFilteredOutput<4>(row[2], ma565[1], b565[1]);
- ma[1] = b[1][0] = LoadAligned16(ab_ptr + 16);
- BoxFilterPreProcess8<3>(row + 2, row_sq + 2, scale[1], &ma[1], &b[1][1],
- ab_ptr + 16);
- p[1] = BoxFilterPass2(row[2], ma[1], b[1], ma343 + 1, ma444 + 1, b343 + 1,
- b444 + 1);
- SelfGuidedDoubleMultiplier(row[2], p, w0, w2, dst_ptr);
- dst_ptr += dst_stride;
- ma565[0] = ma565[1];
- b565[0][0] = b565[1][0], b565[0][1] = b565[1][1];
- ma343[0] = ma343[2];
- ma343[1] = ma343[3];
- ma444[0] = ma444[2];
- b343[0][0] = b343[2][0], b343[0][1] = b343[2][1];
- b343[1][0] = b343[3][0], b343[1][1] = b343[3][1];
- b444[0][0] = b444[2][0], b444[0][1] = b444[2][1];
- }
-
- if ((height & 1) != 0) {
- ab_ptr += 24;
- ma[0] = b[0][0] = LoadAligned16(ab_ptr);
- ma[1] = b[1][0] = LoadAligned16(ab_ptr + 8);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
- row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
- row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
- column += src_stride;
- row[3] = row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
- BoxFilterPreProcess8<5>(row, row_sq, scale[0], &ma[0], &b[0][1], ab_ptr);
- BoxFilterPreProcess8<3>(row + 1, row_sq + 1, scale[1], &ma[1], &b[1][1],
- ab_ptr + 8);
- __m128i p[2];
- p[0] = BoxFilterPass1(row[1], ma[0], b[0], ma565, b565);
- p[1] = BoxFilterPass2(row[1], ma[1], b[1], ma343, ma444, b343, b444);
- SelfGuidedDoubleMultiplier(row[1], p, w0, w2, dst_ptr);
- }
- x += 8;
+ ma[1] = Sum565Hi(ma5);
+ StoreAligned16(ma565[1] + x + 8, ma[1]);
+ Sum565W(bs + 1, b[1]);
+ StoreAligned32U32(b565[1] + x + 8, b[1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
+ p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
} while (x < width);
}
-inline void BoxFilterProcessPass1(const uint8_t* const src,
- const ptrdiff_t src_stride,
- const RestorationUnitInfo& restoration_info,
- const int width, const int height,
- const uint32_t scale, uint16_t* const temp,
- uint8_t* const dst,
- const ptrdiff_t dst_stride) {
- // We have combined PreProcess and Process for the first pass by storing
- // intermediate values in the |ma| region. The values stored are one
- // vertical column of interleaved |ma| and |b| values and consume 8 *
- // |height| values. This is |height| and not |height| * 2 because PreProcess
- // only generates output for every other row. When processing the next column
- // we write the new scratch values right after reading the previously saved
- // ones.
+inline void BoxFilterPass1LastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
+ uint32_t* b565, uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
+ &bs[0]);
- // The PreProcess phase calculates a 5x5 box sum for every other row
- //
- // PreProcess and Process have been combined into the same step. We need 12
- // input values to generate 8 output values for PreProcess:
- // 0 1 2 3 4 5 6 7 8 9 10 11
- // 2 = 0 + 1 + 2 + 3 + 4
- // 3 = 1 + 2 + 3 + 4 + 5
- // 4 = 2 + 3 + 4 + 5 + 6
- // 5 = 3 + 4 + 5 + 6 + 7
- // 6 = 4 + 5 + 6 + 7 + 8
- // 7 = 5 + 6 + 7 + 8 + 9
- // 8 = 6 + 7 + 8 + 9 + 10
- // 9 = 7 + 8 + 9 + 10 + 11
- //
- // and then we need 10 input values to generate 8 output values for Process:
- // 0 1 2 3 4 5 6 7 8 9
- // 1 = 0 + 1 + 2
- // 2 = 1 + 2 + 3
- // 3 = 2 + 3 + 4
- // 4 = 3 + 4 + 5
- // 5 = 4 + 5 + 6
- // 6 = 5 + 6 + 7
- // 7 = 6 + 7 + 8
- // 8 = 7 + 8 + 9
- //
- // To avoid re-calculating PreProcess values over and over again we will do a
- // single column of 8 output values and store the second half of them
- // interleaved in |temp|. The first half is not stored, since it is used
- // immediately and becomes useless for the next column. Next we will start the
- // second column. When 2 rows have been calculated we can calculate Process
- // and output the results.
-
- // Calculate and store a single column. Scope so we can re-use the variable
- // names for the next step.
- uint16_t* ab_ptr = temp;
- const uint8_t* const src_pre_process = src - 2 * src_stride;
- // Calculate intermediate results, including two-pixel border, for example, if
- // unit size is 64x64, we calculate 68x68 pixels.
- {
- const uint8_t* column = src_pre_process - 4;
- __m128i row[5], row_sq[5];
- row[0] = row[1] = LoadLo8(column);
- column += src_stride;
- row[2] = LoadLo8(column);
- row_sq[0] = row_sq[1] = VmullLo8(row[1], row[1]);
- row_sq[2] = VmullLo8(row[2], row[2]);
-
- int y = (height + 2) >> 1;
- do {
- column += src_stride;
- row[3] = LoadLo8(column);
- column += src_stride;
- row[4] = LoadLo8(column);
- row_sq[3] = VmullLo8(row[3], row[3]);
- row_sq[4] = VmullLo8(row[4], row[4]);
- BoxFilterPreProcess4<5, 1>(row, row_sq, scale, ab_ptr);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0] = row_sq[2];
- row_sq[1] = row_sq[3];
- row_sq[2] = row_sq[4];
- ab_ptr += 8;
- } while (--y != 0);
-
- if ((height & 1) != 0) {
- column += src_stride;
- row[3] = row[4] = LoadLo8(column);
- row_sq[3] = row_sq[4] = VmullLo8(row[3], row[3]);
- BoxFilterPreProcess4<5, 1>(row, row_sq, scale, ab_ptr);
- }
- }
-
- const int w0 = restoration_info.sgr_proj_info.multiplier[0];
int x = 0;
do {
- // |src_pre_process| is X but we already processed the first column of 4
- // values so we want to start at Y and increment from there.
- // X s s s Y s s
- // s s s s s s s
- // s s i i i i i
- // s s i o o o o
- // s s i o o o o
+ __m128i ma[2], ma5[3], b[2][2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
+ sq, mas, bs);
+ Prepare3_8<0>(mas, ma5);
+ ma[1] = Sum565Lo(ma5);
+ Sum565W(bs, b[1]);
+ ma[0] = LoadAligned16(ma565);
+ LoadAligned32U32(b565, b[0]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
- // Seed the loop with one line of output. Then, inside the loop, for each
- // iteration we can output one even row and one odd row and carry the new
- // line to the next iteration. In the diagram below 'i' values are
- // intermediary values from the first step and '-' values are empty.
- // iiii
- // ---- > even row
- // iiii - odd row
- // ---- > even row
- // iiii
- __m128i ma[2], b[2], ma565[2], b565[2][2];
- ab_ptr = temp;
- ma[0] = b[0] = LoadAligned16(ab_ptr);
- const uint8_t* column = src_pre_process + x;
- __m128i row[5], row_sq[5][2];
- // Need |width| + 3 pixels, but we read max(|x|) + 16 pixels.
- // Mask max(|x|) + 13 - |width| extra pixels.
- row[0] = row[1] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[2] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[3] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[0][0] = row_sq[1][0] = VmullLo8(row[1], row[1]);
- row_sq[0][1] = row_sq[1][1] = VmullHi8(row[1], row[1]);
- row_sq[2][0] = VmullLo8(row[2], row[2]);
- row_sq[2][1] = VmullHi8(row[2], row[2]);
- row_sq[3][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = VmullHi8(row[3], row[3]);
- row_sq[4][0] = VmullLo8(row[4], row[4]);
- row_sq[4][1] = VmullHi8(row[4], row[4]);
- BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
-
- // Pass 1 Process. These are the only values we need to propagate between
- // rows.
- ma565[0] = Sum565(ma[0]);
- b565[0][0] = Sum565W(_mm_alignr_epi8(b[1], b[0], 8));
- b565[0][1] = Sum565W(b[1]);
- uint8_t* dst_ptr = dst + x;
- // Calculate one output line. Add in the line from the previous pass and
- // output one even row. Sum the new line and output the odd row. Carry the
- // new row into the next pass.
- for (int y = height >> 1; y != 0; --y) {
- ab_ptr += 8;
- ma[0] = b[0] = LoadAligned16(ab_ptr);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
- row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
- row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
- column += src_stride;
- row[3] = LoadUnaligned16Msan(column, x + 13 - width);
- column += src_stride;
- row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[3][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = VmullHi8(row[3], row[3]);
- row_sq[4][0] = VmullLo8(row[4], row[4]);
- row_sq[4][1] = VmullHi8(row[4], row[4]);
- BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
- const __m128i p0 = BoxFilterPass1(row[1], ma[0], b, ma565, b565);
- SelfGuidedSingleMultiplier(row[1], p0, w0, dst_ptr);
- dst_ptr += dst_stride;
- const __m128i p1 = CalculateFilteredOutput<4>(row[2], ma565[1], b565[1]);
- SelfGuidedSingleMultiplier(row[2], p1, w0, dst_ptr);
- dst_ptr += dst_stride;
- ma565[0] = ma565[1];
- b565[0][0] = b565[1][0], b565[0][1] = b565[1][1];
- }
-
- if ((height & 1) != 0) {
- ab_ptr += 8;
- ma[0] = b[0] = LoadAligned16(ab_ptr);
- row[0] = row[2];
- row[1] = row[3];
- row[2] = row[4];
- row_sq[0][0] = row_sq[2][0], row_sq[0][1] = row_sq[2][1];
- row_sq[1][0] = row_sq[3][0], row_sq[1][1] = row_sq[3][1];
- row_sq[2][0] = row_sq[4][0], row_sq[2][1] = row_sq[4][1];
- column += src_stride;
- row[3] = row[4] = LoadUnaligned16Msan(column, x + 13 - width);
- row_sq[3][0] = row_sq[4][0] = VmullLo8(row[3], row[3]);
- row_sq[3][1] = row_sq[4][1] = VmullHi8(row[3], row[3]);
- BoxFilterPreProcess8<5>(row, row_sq, scale, &ma[0], &b[1], ab_ptr);
- const __m128i p0 = BoxFilterPass1(row[1], ma[0], b, ma565, b565);
- SelfGuidedSingleMultiplier(row[1], p0, w0, dst_ptr);
- }
- x += 8;
+ ma[1] = Sum565Hi(ma5);
+ Sum565W(bs + 1, b[1]);
+ ma[0] = LoadAligned16(ma565 + 8);
+ LoadAligned32U32(b565 + 8, b[0]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ p = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ ma565 += 16;
+ b565 += 16;
+ x += 16;
} while (x < width);
}
-inline void BoxFilterProcessPass2(const uint8_t* src,
- const ptrdiff_t src_stride,
- const RestorationUnitInfo& restoration_info,
- const int width, const int height,
- const uint32_t scale, uint16_t* const temp,
- uint8_t* const dst,
- const ptrdiff_t dst_stride) {
- // Calculate intermediate results, including one-pixel border, for example, if
- // unit size is 64x64, we calculate 66x66 pixels.
- // Because of the vectors this calculates start in blocks of 4 so we actually
- // get 68 values.
- uint16_t* ab_ptr = temp;
- const uint8_t* const src_pre_process = src - 2 * src_stride;
- {
- const uint8_t* column = src_pre_process - 3;
- __m128i row[3], row_sq[3];
- row[0] = LoadLo8(column);
- column += src_stride;
- row[1] = LoadLo8(column);
- row_sq[0] = VmullLo8(row[0], row[0]);
- row_sq[1] = VmullLo8(row[1], row[1]);
- int y = height + 2;
- do {
- column += src_stride;
- row[2] = LoadLo8(column);
- row_sq[2] = VmullLo8(row[2], row[2]);
- BoxFilterPreProcess4<3, 0>(row, row_sq, scale, ab_ptr);
- row[0] = row[1];
- row[1] = row[2];
- row_sq[0] = row_sq[1];
- row_sq[1] = row_sq[2];
- ab_ptr += 8;
- } while (--y != 0);
+LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
+ uint16_t* const sum3[3], uint32_t* const square_sum3[3],
+ uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
+ uint32_t* const b444[2], uint8_t* const dst) {
+ __m128i s[2], mas[2], sq[4], bs[3];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
+
+ int x = 0;
+ do {
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass2 - width);
+ BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
+ bs);
+ __m128i ma[3], b[3][2], ma3[3];
+ Prepare3_8<0>(mas, ma3);
+ Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
+ b444[1]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x);
+ ma[1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[0]);
+ LoadAligned32U32(b444[0] + x, b[1]);
+ const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
+
+ Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
+ b343[2], b444[1]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1]);
+ const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
+ const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ mas[0] = mas[1];
+ bs[0] = bs[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilter(
+ const uint8_t* const src, const uint8_t* const src0,
+ const uint8_t* const src1, const ptrdiff_t stride, const int width,
+ const uint16_t scales[2], const int16_t w0, const int16_t w2,
+ uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ const ptrdiff_t sum_width, uint16_t* const ma343[4],
+ uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
+ uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
+ __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
+ s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
+ sq[0][0] = SquareLo8(s[0][0]);
+ sq[1][0] = SquareLo8(s[1][0]);
+ BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
+ ma3, b3, &ma5[0], &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
+ s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
+ sum_width, sq, ma3, b3, ma5, b5);
+ Prepare3_8<0>(ma3[0], ma3x[0]);
+ Prepare3_8<0>(ma3[1], ma3x[1]);
+ Prepare3_8<0>(ma5, ma5x);
+ Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
+ ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
+ b343[3], b444[2]);
+ ma[0][1] = Sum565Lo(ma5x);
+ StoreAligned16(ma565[1] + x, ma[0][1]);
+ Sum565W(b5, b[0][1]);
+ StoreAligned32U32(b565[1] + x, b[0][1]);
+ const __m128i sr0 = LoadAligned16(src + x);
+ const __m128i sr1 = LoadAligned16(src + stride + x);
+ const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x);
+ LoadAligned32U32(b565[0] + x, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x);
+ ma[1][1] = LoadAligned16(ma444[0] + x);
+ LoadAligned32U32(b343[0] + x, b[1][0]);
+ LoadAligned32U32(b444[0] + x, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
+ const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
+ ma[2][0] = LoadAligned16(ma343[1] + x);
+ LoadAligned32U32(b343[1] + x, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
+ const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
+
+ Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
+ b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
+ Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
+ ma444[2], b343[3], b444[2]);
+ ma[0][1] = Sum565Hi(ma5x);
+ StoreAligned16(ma565[1] + x + 8, ma[0][1]);
+ Sum565W(b5 + 1, b[0][1]);
+ StoreAligned32U32(b565[1] + x + 8, b[0][1]);
+ const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
+ const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
+ ma[0][0] = LoadAligned16(ma565[0] + x + 8);
+ LoadAligned32U32(b565[0] + x + 8, b[0][0]);
+ p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
+ p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
+ ma[1][0] = LoadAligned16(ma343[0] + x + 8);
+ ma[1][1] = LoadAligned16(ma444[0] + x + 8);
+ LoadAligned32U32(b343[0] + x + 8, b[1][0]);
+ LoadAligned32U32(b444[0] + x + 8, b[1][1]);
+ p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
+ const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
+ ma[2][0] = LoadAligned16(ma343[1] + x + 8);
+ LoadAligned32U32(b343[1] + x + 8, b[2][0]);
+ p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
+ const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
+ StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
+ s[0][0] = s[0][1];
+ s[1][0] = s[1][1];
+ sq[0][1] = sq[0][3];
+ sq[1][1] = sq[1][3];
+ ma3[0][0] = ma3[0][1];
+ ma3[1][0] = ma3[1][1];
+ ma5[0] = ma5[1];
+ b3[0][0] = b3[0][2];
+ b3[1][0] = b3[1][2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+inline void BoxFilterLastRow(
+ const uint8_t* const src, const uint8_t* const src0, const int width,
+ const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
+ const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
+ uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
+ uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
+ uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
+ uint8_t* const dst) {
+ __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
+ s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
+ sq[0] = SquareLo8(s[0]);
+ BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
+ square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
+ &b5[0]);
+
+ int x = 0;
+ do {
+ __m128i ma3x[3], ma5x[3], p[2];
+ s[1] = LoadUnaligned16Msan(src0 + x + 16,
+ x + 16 + kOverreadInBytesPass1 - width);
+ BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
+ square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
+ Prepare3_8<0>(ma3, ma3x);
+ Prepare3_8<0>(ma5, ma5x);
+ ma[1] = Sum565Lo(ma5x);
+ Sum565W(b5, b[1]);
+ ma[2] = Sum343Lo(ma3x);
+ Sum343W(b3, b[2]);
+ const __m128i sr = LoadAligned16(src + x);
+ const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565 + x);
+ LoadAligned32U32(b565 + x, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
+ ma[0] = LoadAligned16(ma343 + x);
+ ma[1] = LoadAligned16(ma444 + x);
+ LoadAligned32U32(b343 + x, b[0]);
+ LoadAligned32U32(b444 + x, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
+ const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
+
+ ma[1] = Sum565Hi(ma5x);
+ Sum565W(b5 + 1, b[1]);
+ ma[2] = Sum343Hi(ma3x);
+ Sum343W(b3 + 1, b[2]);
+ const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
+ ma[0] = LoadAligned16(ma565 + x + 8);
+ LoadAligned32U32(b565 + x + 8, b[0]);
+ p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
+ ma[0] = LoadAligned16(ma343 + x + 8);
+ ma[1] = LoadAligned16(ma444 + x + 8);
+ LoadAligned32U32(b343 + x + 8, b[0]);
+ LoadAligned32U32(b444 + x + 8, b[1]);
+ p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
+ const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
+ StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
+ s[0] = s[1];
+ sq[1] = sq[3];
+ ma3[0] = ma3[1];
+ ma5[0] = ma5[1];
+ b3[0] = b3[2];
+ b5[0] = b5[2];
+ x += 16;
+ } while (x < width);
+}
+
+LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
+ const RestorationUnitInfo& restoration_info, const uint8_t* src,
+ const ptrdiff_t stride, const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
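+ // The multipliers satisfy w0 + w1 + w2 == (1 << kSgrProjPrecisionBits),
+ // i.e. the three weights sum to unity at kSgrProjPrecisionBits precision.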
+ uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
+ uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 3; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ b444[0] = sgr_buffer->b444;
+ for (int i = 1; i <= 2; ++i) {
+ ma444[i] = ma444[i - 1] + temp_stride;
+ b444[i] = b444[i - 1] + temp_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
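+ // |sum3|/|sum5| (and their squared counterparts) act as rings of 4 and 5
+ // row pointers, and |ma343|/|ma444|/|ma565| (with the matching |b| arrays)
+ // as rings of 4, 3 and 2 rows; the Circulate*() and std::swap() calls below
+ // rotate them as the filter walks down the rows.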
+ assert(scales[0] != 0);
+ assert(scales[1] != 0);
+ BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
+ sum5[1], square_sum3[0], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
+ square_sum5, sum_width, ma343, ma444[0], ma565[0],
+ b343, b444[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
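+ // Main loop: each iteration computes box sums for the next two input rows,
+ // writes two filtered rows to |dst|, and then rotates the row rings by two.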
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
+ scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
+ ma343, ma444, ma565, b343, b444, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
}
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
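+ // Filter the last full pair of rows, pulling rows from |bottom_border| as
+ // needed. Only a one-row unit (height == 1) skips this and is handled
+ // entirely by BoxFilterLastRow() below.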
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
+ square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
+ b444, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ Circulate4PointersBy2<uint16_t>(sum3);
+ Circulate4PointersBy2<uint32_t>(square_sum3);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ Circulate4PointersBy2<uint16_t>(ma343);
+ Circulate4PointersBy2<uint32_t>(b343);
+ std::swap(ma444[0], ma444[2]);
+ std::swap(b444[0], b444[2]);
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+ BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
+ sum_width, scales, w0, w2, sum3, sum5, square_sum3,
+ square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
+ b444[0], b565[0], dst);
+ }
+}
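+
+// Illustrative sketch of the rotate-by-two pointer helpers used in the loop
+// above (the real Circulate{4,5}PointersBy2 helpers live elsewhere in the
+// library and may differ in detail). Rotating the ring of row pointers lets
+// each two-row iteration reuse three of the five 5x5 sum rows instead of
+// recomputing them.
+template <typename T>
+void Circulate5PointersBy2Sketch(T* p[5]) {
+  T* const p0 = p[0];
+  T* const p1 = p[1];
+  p[0] = p[2];
+  p[1] = p[3];
+  p[2] = p[4];
+  p[3] = p0;
+  p[4] = p1;
+}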
+
+inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0]; // < 2^12.
+ const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
+ uint16_t *sum5[5], *ma565[2];
+ uint32_t *square_sum5[5], *b565[2];
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+ for (int i = 1; i <= 4; ++i) {
+ sum5[i] = sum5[i - 1] + sum_stride;
+ square_sum5[i] = square_sum5[i - 1] + sum_stride;
+ }
+ ma565[0] = sgr_buffer->ma565;
+ ma565[1] = ma565[0] + temp_stride;
+ b565[0] = sgr_buffer->b565;
+ b565[1] = b565[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum5[1], square_sum5[1]);
+ sum5[0] = sum5[1];
+ square_sum5[0] = square_sum5[1];
+ const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
+ BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
+ ma565[0], b565[0]);
+ sum5[0] = sgr_buffer->sum5;
+ square_sum5[0] = sgr_buffer->square_sum5;
+
+ for (int y = (height >> 1) - 1; y > 0; --y) {
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
+ square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ }
+
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ if ((height & 1) == 0 || height > 1) {
+ const uint8_t* sr[2];
+ if ((height & 1) == 0) {
+ sr[0] = bottom_border;
+ sr[1] = bottom_border + bottom_border_stride;
+ } else {
+ sr[0] = src + 2 * stride;
+ sr[1] = bottom_border;
+ }
+ BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
+ sum_width, scale, w0, ma565, b565, dst);
+ }
+ if ((height & 1) != 0) {
+ src += 3;
+ if (height > 1) {
+ src += 2 * stride;
+ dst += 2 * stride;
+ std::swap(ma565[0], ma565[1]);
+ std::swap(b565[0], b565[1]);
+ Circulate5PointersBy2<uint16_t>(sum5);
+ Circulate5PointersBy2<uint32_t>(square_sum5);
+ }
+ BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
+ sum_width, scale, w0, sum5, square_sum5, ma565[0],
+ b565[0], dst);
+ }
+}
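+
+// Accounting for the two-rows-per-iteration structure above, with height == 7:
+// the main loop runs (7 >> 1) - 1 = 2 trips and writes rows 0-3, the
+// border-assisted BoxFilterPass1 call writes rows 4-5, and
+// BoxFilterPass1LastRow writes row 6.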
+
+inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
+ const uint8_t* src, const ptrdiff_t stride,
+ const uint8_t* const top_border,
+ const ptrdiff_t top_border_stride,
+ const uint8_t* bottom_border,
+ const ptrdiff_t bottom_border_stride,
+ const int width, const int height,
+ SgrBuffer* const sgr_buffer, uint8_t* dst) {
assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
- const int w1 = restoration_info.sgr_proj_info.multiplier[1];
- const int w0 = (1 << kSgrProjPrecisionBits) - w1;
- int x = 0;
- do {
- ab_ptr = temp;
- __m128i ma, b[2], ma343[3], ma444[2], b343[3][2], b444[2][2];
- ma = b[0] = LoadAligned16(ab_ptr);
- const uint8_t* column = src_pre_process + x;
- __m128i row[3], row_sq[3][2];
- // Need |width| + 2 pixels, but we read max(|x|) + 16 pixels.
- // Mask max(|x|) + 14 - |width| extra pixels.
- row[0] = LoadUnaligned16Msan(column, x + 14 - width);
- column += src_stride;
- row[1] = LoadUnaligned16Msan(column, x + 14 - width);
- column += src_stride;
- row[2] = LoadUnaligned16Msan(column, x + 14 - width);
- row_sq[0][0] = VmullLo8(row[0], row[0]);
- row_sq[0][1] = VmullHi8(row[0], row[0]);
- row_sq[1][0] = VmullLo8(row[1], row[1]);
- row_sq[1][1] = VmullHi8(row[1], row[1]);
- row_sq[2][0] = VmullLo8(row[2], row[2]);
- row_sq[2][1] = VmullHi8(row[2], row[2]);
- BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
- ma343[0] = Sum343(ma);
- Sum343W(b, b343[0]);
- ab_ptr += 8;
- ma = b[0] = LoadAligned16(ab_ptr);
- row[0] = row[1];
- row[1] = row[2];
- row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
- row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
- column += src_stride;
- row[2] = LoadUnaligned16Msan(column, x + 14 - width);
- row_sq[2][0] = VmullLo8(row[2], row[2]);
- row_sq[2][1] = VmullHi8(row[2], row[2]);
- BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
- Sum343_444(ma, &ma343[1], &ma444[0]);
- Sum343_444W(b, b343[1], b444[0]);
+ const auto temp_stride = Align<ptrdiff_t>(width, 16);
+ const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
+ const auto sum_stride = temp_stride + 16;
+ const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
+ const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
+ const int sgr_proj_index = restoration_info.sgr_proj_info.index;
+ const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1]; // < 2^12.
+ uint16_t *sum3[3], *ma343[3], *ma444[2];
+ uint32_t *square_sum3[3], *b343[3], *b444[2];
+ sum3[0] = sgr_buffer->sum3;
+ square_sum3[0] = sgr_buffer->square_sum3;
+ ma343[0] = sgr_buffer->ma343;
+ b343[0] = sgr_buffer->b343;
+ for (int i = 1; i <= 2; ++i) {
+ sum3[i] = sum3[i - 1] + sum_stride;
+ square_sum3[i] = square_sum3[i - 1] + sum_stride;
+ ma343[i] = ma343[i - 1] + temp_stride;
+ b343[i] = b343[i - 1] + temp_stride;
+ }
+ ma444[0] = sgr_buffer->ma444;
+ ma444[1] = ma444[0] + temp_stride;
+ b444[0] = sgr_buffer->b444;
+ b444[1] = b444[0] + temp_stride;
+ assert(scale != 0);
+ BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
+ sum3[0], square_sum3[0]);
+ BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
+ sum_width, ma343[0], nullptr, b343[0],
+ nullptr);
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ const uint8_t* s;
+ if (height > 1) {
+ s = src + stride;
+ } else {
+ s = bottom_border;
+ bottom_border += bottom_border_stride;
+ }
+ BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
+ ma343[1], ma444[0], b343[1], b444[0]);
- uint8_t* dst_ptr = dst + x;
- int y = height;
- do {
- ab_ptr += 8;
- ma = b[0] = LoadAligned16(ab_ptr);
- row[0] = row[1];
- row[1] = row[2];
- row_sq[0][0] = row_sq[1][0], row_sq[0][1] = row_sq[1][1];
- row_sq[1][0] = row_sq[2][0], row_sq[1][1] = row_sq[2][1];
- column += src_stride;
- row[2] = LoadUnaligned16Msan(column, x + 14 - width);
- row_sq[2][0] = VmullLo8(row[2], row[2]);
- row_sq[2][1] = VmullHi8(row[2], row[2]);
- BoxFilterPreProcess8<3>(row, row_sq, scale, &ma, &b[1], ab_ptr);
- const __m128i p = BoxFilterPass2(row[0], ma, b, ma343, ma444, b343, b444);
- SelfGuidedSingleMultiplier(row[0], p, w0, dst_ptr);
- ma343[0] = ma343[1];
- ma343[1] = ma343[2];
- ma444[0] = ma444[1];
- b343[0][0] = b343[1][0], b343[0][1] = b343[1][1];
- b343[1][0] = b343[2][0], b343[1][1] = b343[2][1];
- b444[0][0] = b444[1][0], b444[0][1] = b444[1][1];
- dst_ptr += dst_stride;
- } while (--y != 0);
- x += 8;
- } while (x < width);
+ for (int y = height - 2; y > 0; --y) {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ }
+
+ int y = std::min(height, 2);
+ src += 2;
+ do {
+ Circulate3PointersBy1<uint16_t>(sum3);
+ Circulate3PointersBy1<uint32_t>(square_sum3);
+ BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
+ square_sum3, ma343, ma444, b343, b444, dst);
+ src += stride;
+ dst += stride;
+ bottom_border += bottom_border_stride;
+ Circulate3PointersBy1<uint16_t>(ma343);
+ Circulate3PointersBy1<uint32_t>(b343);
+ std::swap(ma444[0], ma444[1]);
+ std::swap(b444[0], b444[1]);
+ } while (--y != 0);
}
-// If |width| is non-multiple of 8, up to 7 more pixels are written to |dest| in
-// the end of each row. It is safe to overwrite the output as it will not be
+// If |width| is not a multiple of 16, up to 15 more pixels are written to
+// |dest| at the end of each row. It is safe to overwrite the output as it will not be
// part of the visible frame.
-void SelfGuidedFilter_SSE4_1(const void* const source, void* const dest,
- const RestorationUnitInfo& restoration_info,
- const ptrdiff_t source_stride,
- const ptrdiff_t dest_stride, const int width,
- const int height,
- RestorationBuffer* const buffer) {
+void SelfGuidedFilter_SSE4_1(
+ const RestorationUnitInfo& restoration_info, const void* const source,
+ const ptrdiff_t stride, const void* const top_border,
+ const ptrdiff_t top_border_stride, const void* const bottom_border,
+ const ptrdiff_t bottom_border_stride, const int width, const int height,
+ RestorationBuffer* const restoration_buffer, void* const dest) {
const int index = restoration_info.sgr_proj_info.index;
const int radius_pass_0 = kSgrProjParams[index][0]; // 2 or 0
const int radius_pass_1 = kSgrProjParams[index][2]; // 1 or 0
- const auto* src = static_cast<const uint8_t*>(source);
- auto* dst = static_cast<uint8_t*>(dest);
+ const auto* const src = static_cast<const uint8_t*>(source);
+ const auto* top = static_cast<const uint8_t*>(top_border);
+ const auto* bottom = static_cast<const uint8_t*>(bottom_border);
+ auto* const dst = static_cast<uint8_t*>(dest);
+ SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
if (radius_pass_1 == 0) {
// |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
// following assertion.
assert(radius_pass_0 != 0);
- BoxFilterProcessPass1(src, source_stride, restoration_info, width, height,
- kSgrScaleParameter[index][0],
- buffer->sgr_buffer.temp_buffer, dst, dest_stride);
+ BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else if (radius_pass_0 == 0) {
- BoxFilterProcessPass2(src, source_stride, restoration_info, width, height,
- kSgrScaleParameter[index][1],
- buffer->sgr_buffer.temp_buffer, dst, dest_stride);
+ BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
+ top_border_stride, bottom - 2, bottom_border_stride,
+ width, height, sgr_buffer, dst);
} else {
- BoxFilterProcess(src, source_stride, restoration_info, width, height,
- kSgrScaleParameter[index], buffer->sgr_buffer.temp_buffer,
- dst, dest_stride);
+ BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
+ top_border_stride, bottom - 3, bottom_border_stride, width,
+ height, sgr_buffer, dst);
}
}
void Init8bpp() {
Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
assert(dsp != nullptr);
+ static_cast<void>(dsp);
#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
dsp->loop_restorations[0] = WienerFilter_SSE4_1;
+#else
+ static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
+#else
+ static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}
@@ -1789,7 +2565,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -1797,4 +2573,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/loop_restoration_sse4.h b/libgav1/src/dsp/x86/loop_restoration_sse4.h
index e11f35a..00df3af 100644
--- a/libgav1/src/dsp/x86/loop_restoration_sse4.h
+++ b/libgav1/src/dsp/x86/loop_restoration_sse4.h
@@ -24,15 +24,16 @@
namespace dsp {
// Initializes Dsp::loop_restorations, see the defines below for specifics.
-// This function is not thread-safe.
+// These functions are not thread-safe.
void LoopRestorationInit_SSE4_1();
+void LoopRestorationInit10bpp_SSE4_1();
} // namespace dsp
} // namespace libgav1
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_WienerFilter
#define LIBGAV1_Dsp8bpp_WienerFilter LIBGAV1_CPU_SSE4_1
@@ -42,6 +43,14 @@
#define LIBGAV1_Dsp8bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_WienerFilter
+#define LIBGAV1_Dsp10bpp_WienerFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SelfGuidedFilter
+#define LIBGAV1_Dsp10bpp_SelfGuidedFilter LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_LOOP_RESTORATION_SSE4_H_
diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.cc b/libgav1/src/dsp/x86/mask_blend_sse4.cc
index 76d3811..2e836af 100644
--- a/libgav1/src/dsp/x86/mask_blend_sse4.cc
+++ b/libgav1/src/dsp/x86/mask_blend_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/mask_blend.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -121,10 +121,8 @@
const __m128i pred_mask_0,
const __m128i pred_mask_1, uint8_t* dst,
const ptrdiff_t dst_stride) {
- const __m128i pred_val_0_lo = LoadLo8(pred_0);
- const __m128i pred_val_0 = LoadHi8(pred_val_0_lo, pred_0 + 4);
- const __m128i pred_val_1_lo = LoadLo8(pred_1);
- const __m128i pred_val_1 = LoadHi8(pred_val_1_lo, pred_1 + 4);
+ const __m128i pred_val_0 = LoadAligned16(pred_0);
+ const __m128i pred_val_1 = LoadAligned16(pred_1);
const __m128i mask_lo = _mm_unpacklo_epi16(pred_mask_0, pred_mask_1);
const __m128i mask_hi = _mm_unpackhi_epi16(pred_mask_0, pred_mask_1);
const __m128i pred_lo = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
@@ -286,8 +284,7 @@
const __m128i pred_mask_1) {
const __m128i pred_mask = _mm_unpacklo_epi8(pred_mask_0, pred_mask_1);
- __m128i pred_val_0 = Load4(pred_0);
- pred_val_0 = _mm_or_si128(_mm_slli_si128(Load4(pred_0 + 4), 4), pred_val_0);
+ const __m128i pred_val_0 = LoadLo8(pred_0);
// TODO(b/150326556): One load.
__m128i pred_val_1 = Load4(pred_1);
pred_val_1 = _mm_or_si128(_mm_slli_si128(Load4(pred_1 + pred_stride_1), 4),
@@ -433,12 +430,515 @@
} // namespace
} // namespace low_bitdepth
-void MaskBlendInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kMax10bppSample = (1 << 10) - 1;
+constexpr int kMaskInverse = 64;
+constexpr int kRoundBitsMaskBlend = 4;
+
+inline __m128i RightShiftWithRoundingZero_U16(const __m128i v_val_d, int bits,
+ const __m128i zero) {
+ // Shift out all but the last bit.
+ const __m128i v_tmp_d = _mm_srli_epi16(v_val_d, bits - 1);
+ // Avg with zero will shift by 1 and round.
+ return _mm_avg_epu16(v_tmp_d, zero);
+}
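+
+// Scalar sketch of the trick above (assumes an unsigned |v| and bits >= 1):
+// discarding all but one low bit first means the +1 from the average cannot
+// overflow a full-range uint16_t, while still matching the usual
+// (v + (1 << (bits - 1))) >> bits rounding shift exactly.
+inline uint16_t RightShiftWithRoundingZeroScalar(const uint16_t v,
+                                                 const int bits) {
+  const uint16_t tmp = v >> (bits - 1);  // keep only the rounding bit
+  return (tmp + 1) >> 1;  // what _mm_avg_epu16(tmp, zero) computes per lane
+}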
+
+inline __m128i RightShiftWithRoundingConst_S32(const __m128i v_val_d, int bits,
+ const __m128i shift) {
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, shift);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask4x2(const uint8_t* mask, ptrdiff_t mask_stride,
+ const __m128i zero) {
+ if (subsampling_x == 1) {
+ if (subsampling_y == 0) {
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(LoadLo8(mask));
+ const __m128i mask_val_1 =
+ _mm_cvtepu8_epi16(LoadLo8(mask + (mask_stride << subsampling_y)));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+ }
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 =
+ LoadHi8(LoadLo8(mask), mask + (mask_stride << 1));
+ const __m128i mask_val_1 = LoadHi8(LoadLo8(mask + mask_stride),
+ mask + (mask_stride << 1) + mask_stride);
+ const __m128i add = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i subsampled_mask = _mm_maddubs_epi16(add, one);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 2, zero);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val_0 = Load4(mask);
+ const __m128i mask_val_1 = Load4(mask + mask_stride);
+ return _mm_cvtepu8_epi16(
+ _mm_or_si128(mask_val_0, _mm_slli_si128(mask_val_1, 4)));
+}
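+
+// Scalar sketch of the 4:2:0 branch above: each output mask value is the
+// rounded average of a 2x2 block of mask samples. The saturating byte add
+// cannot actually saturate since AV1 mask values are at most 64.
+inline uint16_t SubsampledMask420Sketch(const uint8_t* const mask,
+                                        const ptrdiff_t stride, const int x) {
+  const int sum = mask[2 * x] + mask[2 * x + 1] + mask[stride + 2 * x] +
+                  mask[stride + 2 * x + 1];
+  return static_cast<uint16_t>((sum + 2) >> 2);
+}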
+
+template <int subsampling_x, int subsampling_y>
+inline __m128i GetMask8(const uint8_t* mask, const ptrdiff_t stride,
+ const __m128i zero) {
+ if (subsampling_x == 1) {
+ if (subsampling_y == 0) {
+ const __m128i row_vals = LoadUnaligned16(mask);
+ const __m128i mask_val_0 = _mm_cvtepu8_epi16(row_vals);
+ const __m128i mask_val_1 = _mm_cvtepu8_epi16(_mm_srli_si128(row_vals, 8));
+ __m128i subsampled_mask = _mm_hadd_epi16(mask_val_0, mask_val_1);
+ return RightShiftWithRoundingZero_U16(subsampled_mask, 1, zero);
+ }
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i mask_val_0 = LoadUnaligned16(mask);
+ const __m128i mask_val_1 = LoadUnaligned16(mask + stride);
+ const __m128i add_0 = _mm_adds_epu8(mask_val_0, mask_val_1);
+ const __m128i mask_0 = _mm_maddubs_epi16(add_0, one);
+ return RightShiftWithRoundingZero_U16(mask_0, 2, zero);
+ }
+ assert(subsampling_y == 0 && subsampling_x == 0);
+ const __m128i mask_val = LoadLo8(mask);
+ return _mm_cvtepu8_epi16(mask_val);
+}
+
+inline void WriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+ const __m128i& pred_mask_1, const __m128i& offset, const __m128i& max,
+ const __m128i& shift4, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0);
+ const __m128i pred_val_1 = LoadHi8(LoadLo8(pred_1), pred_1 + pred_stride_1);
+
+ // int res = (mask_value * pred_0[x] + (64 - mask_value) * pred_1[x]) >> 6;
+ const __m128i compound_pred_lo_0 = _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 = _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 = _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 = _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+ // res -= (bitdepth == 8) ? 0 : kCompoundOffset;
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+
+ // dst[x] = static_cast<Pixel>(
+ // Clip3(RightShiftWithRounding(res, inter_post_round_bits), 0,
+  //           (1 << kBitdepth10) - 1));
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result = _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreLo8(dst, result);
+ StoreHi8(dst + dst_stride, result);
+}
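+
+// Scalar reference for one pixel of the 10bpp compound mask blend above, a
+// sketch mirroring the commented pseudo code (kCompoundOffset comes from a
+// shared header; Clip3 from src/utils/common.h):
+inline uint16_t MaskBlendPixel10bppSketch(const uint16_t p0, const uint16_t p1,
+                                          const int m) {
+  int res = (m * p0 + (64 - m) * p1) >> 6;
+  res -= kCompoundOffset;
+  res = (res + (1 << (kRoundBitsMaskBlend - 1))) >> kRoundBitsMaskBlend;
+  return static_cast<uint16_t>(Clip3(res, 0, kMax10bppSample));
+}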
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4x4_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1, pred_mask_0,
+ pred_mask_1, offset, max, shift4, dst,
+ dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ MaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ WriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, offset, max,
+ shift4, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void MaskBlend10bpp_SSE4_1(const void* prediction_0,
+ const void* prediction_1,
+ const ptrdiff_t prediction_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width,
+ const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ MaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ const __m128i offset = _mm_set1_epi32(kCompoundOffset);
+ const __m128i max = _mm_set1_epi16(kMax10bppSample);
+ const __m128i shift4 = _mm_set1_epi32((1 << kRoundBitsMaskBlend) >> 1);
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride, zero);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+
+ const __m128i compound_pred_lo_0 =
+ _mm_mullo_epi16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_hi_0 =
+ _mm_mulhi_epu16(pred_val_0, pred_mask_0);
+ const __m128i compound_pred_lo_1 =
+ _mm_mullo_epi16(pred_val_1, pred_mask_1);
+ const __m128i compound_pred_hi_1 =
+ _mm_mulhi_epu16(pred_val_1, pred_mask_1);
+ const __m128i pack0_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack0_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_0, compound_pred_hi_0);
+ const __m128i pack1_lo =
+ _mm_unpacklo_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i pack1_hi =
+ _mm_unpackhi_epi16(compound_pred_lo_1, compound_pred_hi_1);
+ const __m128i compound_pred_lo = _mm_add_epi32(pack0_lo, pack1_lo);
+ const __m128i compound_pred_hi = _mm_add_epi32(pack0_hi, pack1_hi);
+
+ const __m128i sub_0 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_lo, 6), offset);
+ const __m128i sub_1 =
+ _mm_sub_epi32(_mm_srli_epi32(compound_pred_hi, 6), offset);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(sub_0, kRoundBitsMaskBlend, shift4);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(sub_1, kRoundBitsMaskBlend, shift4);
+ const __m128i result =
+ _mm_min_epi16(_mm_packus_epi32(shift_0, shift_1), max);
+ StoreUnaligned16(dst + x, result);
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+inline void InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(
+ const uint16_t* prediction_0, const uint16_t* prediction_1,
+ const ptrdiff_t pred_stride_1, const __m128i& pred_mask_0,
+ const __m128i& pred_mask_1, const __m128i& shift6, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const __m128i pred_val_0 = LoadUnaligned16(prediction_0);
+ const __m128i pred_val_1 =
+ LoadHi8(LoadLo8(prediction_1), prediction_1 + pred_stride_1);
+
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ const __m128i res = _mm_packus_epi32(shift_0, shift_1);
+ StoreLo8(dst, res);
+ StoreHi8(dst + dst_stride, res);
+}
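+
+// Scalar sketch of what one _mm_madd_epi16 lane computes after the unpacks
+// above: adjacent 16-bit lanes hold {64 - m, m} against {pred_0, pred_1}, so
+// each 32-bit lane is the full blend sum, rounded and shifted by 6 afterwards.
+inline uint16_t InterIntraBlendPixel10bppSketch(const uint16_t pred0,
+                                                const uint16_t pred1,
+                                                const int m) {
+  const int32_t sum = (64 - m) * pred0 + m * pred1;  // one madd lane
+  return static_cast<uint16_t>((sum + (1 << 5)) >> 6);
+}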
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4x4_SSE4_1(
+ const uint16_t* pred_0, const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1, const uint8_t* mask,
+ const ptrdiff_t mask_stride, uint16_t* dst, const ptrdiff_t dst_stride) {
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+ pred_0 += 4 << 1;
+ pred_1 += pred_stride_1 << 1;
+ mask += mask_stride << (1 + subsampling_y);
+ dst += dst_stride << 1;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1, shift6,
+ dst, dst_stride);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp4xH_SSE4_1(const uint16_t* pred_0,
+ const uint16_t* pred_1,
+ const ptrdiff_t pred_stride_1,
+ const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride,
+ const int height, uint16_t* dst,
+ const ptrdiff_t dst_stride) {
+ const uint8_t* mask = mask_ptr;
+ if (height == 4) {
+ InterIntraMaskBlend10bpp4x4_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask, mask_stride, dst, dst_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const uint8_t pred0_stride2 = 4 << 1;
+ const ptrdiff_t pred1_stride2 = pred_stride_1 << 1;
+ const ptrdiff_t mask_stride2 = mask_stride << (1 + subsampling_y);
+ const ptrdiff_t dst_stride2 = dst_stride << 1;
+ int y = height;
+ do {
+ __m128i pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+
+ pred_mask_0 =
+ GetMask4x2<subsampling_x, subsampling_y>(mask, mask_stride, zero);
+ pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ InterIntraWriteMaskBlendLine10bpp4x2_SSE4_1(pred_0, pred_1, pred_stride_1,
+ pred_mask_0, pred_mask_1,
+ shift6, dst, dst_stride);
+ pred_0 += pred0_stride2;
+ pred_1 += pred1_stride2;
+ mask += mask_stride2;
+ dst += dst_stride2;
+ y -= 8;
+ } while (y != 0);
+}
+
+template <int subsampling_x, int subsampling_y>
+inline void InterIntraMaskBlend10bpp_SSE4_1(
+ const void* prediction_0, const void* prediction_1,
+ const ptrdiff_t prediction_stride_1, const uint8_t* const mask_ptr,
+ const ptrdiff_t mask_stride, const int width, const int height, void* dest,
+ const ptrdiff_t dest_stride) {
+ auto* dst = static_cast<uint16_t*>(dest);
+ const ptrdiff_t dst_stride = dest_stride / sizeof(dst[0]);
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ const ptrdiff_t pred_stride_0 = width;
+ const ptrdiff_t pred_stride_1 = prediction_stride_1;
+ if (width == 4) {
+ InterIntraMaskBlend10bpp4xH_SSE4_1<subsampling_x, subsampling_y>(
+ pred_0, pred_1, pred_stride_1, mask_ptr, mask_stride, height, dst,
+ dst_stride);
+ return;
+ }
+ const uint8_t* mask = mask_ptr;
+ const __m128i mask_inverter = _mm_set1_epi16(kMaskInverse);
+ const __m128i shift6 = _mm_set1_epi32((1 << 6) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const ptrdiff_t mask_stride_ss = mask_stride << subsampling_y;
+ int y = height;
+ do {
+ int x = 0;
+ do {
+ const __m128i pred_mask_0 = GetMask8<subsampling_x, subsampling_y>(
+ mask + (x << subsampling_x), mask_stride, zero);
+ const __m128i pred_val_0 = LoadUnaligned16(pred_0 + x);
+ const __m128i pred_val_1 = LoadUnaligned16(pred_1 + x);
+ // 64 - mask
+ const __m128i pred_mask_1 = _mm_sub_epi16(mask_inverter, pred_mask_0);
+ const __m128i mask_0 = _mm_unpacklo_epi16(pred_mask_1, pred_mask_0);
+ const __m128i mask_1 = _mm_unpackhi_epi16(pred_mask_1, pred_mask_0);
+ const __m128i pred_0 = _mm_unpacklo_epi16(pred_val_0, pred_val_1);
+ const __m128i pred_1 = _mm_unpackhi_epi16(pred_val_0, pred_val_1);
+
+ const __m128i compound_pred_0 = _mm_madd_epi16(pred_0, mask_0);
+ const __m128i compound_pred_1 = _mm_madd_epi16(pred_1, mask_1);
+ const __m128i shift_0 =
+ RightShiftWithRoundingConst_S32(compound_pred_0, 6, shift6);
+ const __m128i shift_1 =
+ RightShiftWithRoundingConst_S32(compound_pred_1, 6, shift6);
+ StoreUnaligned16(dst + x, _mm_packus_epi32(shift_0, shift_1));
+ x += 8;
+ } while (x < width);
+ dst += dst_stride;
+ pred_0 += pred_stride_0;
+ pred_1 += pred_stride_1;
+ mask += mask_stride_ss;
+ } while (--y != 0);
+}
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend444)
+ dsp->mask_blend[0][0] = MaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend422)
+ dsp->mask_blend[1][0] = MaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlend420)
+ dsp->mask_blend[2][0] = MaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra444)
+ dsp->mask_blend[0][1] = InterIntraMaskBlend10bpp_SSE4_1<0, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra422)
+ dsp->mask_blend[1][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 0>;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(MaskBlendInterIntra420)
+ dsp->mask_blend[2][1] = InterIntraMaskBlend10bpp_SSE4_1<1, 1>;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void MaskBlendInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -447,4 +947,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/mask_blend_sse4.h b/libgav1/src/dsp/x86/mask_blend_sse4.h
index cfd5e9a..4a95f0c 100644
--- a/libgav1/src/dsp/x86/mask_blend_sse4.h
+++ b/libgav1/src/dsp/x86/mask_blend_sse4.h
@@ -29,13 +29,56 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend444
#define LIBGAV1_Dsp8bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend422
#define LIBGAV1_Dsp8bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_MaskBlend420
#define LIBGAV1_Dsp8bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444
#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422
#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420
#define LIBGAV1_Dsp8bpp_InterIntraMaskBlend8bpp420 LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend444
+#define LIBGAV1_Dsp10bpp_MaskBlend444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend422
+#define LIBGAV1_Dsp10bpp_MaskBlend422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlend420
+#define LIBGAV1_Dsp10bpp_MaskBlend420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra444
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra444 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra422
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra422 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_MaskBlendInterIntra420
+#define LIBGAV1_Dsp10bpp_MaskBlendInterIntra420 LIBGAV1_CPU_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_MASK_BLEND_SSE4_H_
diff --git a/libgav1/src/dsp/x86/motion_field_projection_sse4.cc b/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
index 1875198..e3f2cce 100644
--- a/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
+++ b/libgav1/src/dsp/x86/motion_field_projection_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/motion_field_projection.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -139,9 +139,9 @@
const ptrdiff_t offset =
static_cast<int16_t>(_mm_extract_epi16(position, idx));
if ((idx & 3) == 0) {
- dst_mv[offset].mv32 = _mm_cvtsi128_si32(mv);
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_cvtsi128_si32(mv));
} else {
- dst_mv[offset].mv32 = _mm_extract_epi32(mv, idx & 3);
+ dst_mv[offset].mv32 = static_cast<uint32_t>(_mm_extract_epi32(mv, idx & 3));
}
dst_reference_offset[offset] = _mm_extract_epi8(reference_offset, idx);
}
@@ -386,7 +386,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -394,4 +394,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/motion_field_projection_sse4.h b/libgav1/src/dsp/x86/motion_field_projection_sse4.h
index 7828de5..c05422c 100644
--- a/libgav1/src/dsp/x86/motion_field_projection_sse4.h
+++ b/libgav1/src/dsp/x86/motion_field_projection_sse4.h
@@ -30,8 +30,12 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel
#define LIBGAV1_Dsp8bpp_MotionFieldProjectionKernel LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_MOTION_FIELD_PROJECTION_SSE4_H_
diff --git a/libgav1/src/dsp/x86/motion_vector_search_sse4.cc b/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
index e49be12..7f5f035 100644
--- a/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
+++ b/libgav1/src/dsp/x86/motion_vector_search_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/motion_vector_search.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -251,7 +251,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -259,4 +259,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/motion_vector_search_sse4.h b/libgav1/src/dsp/x86/motion_vector_search_sse4.h
index b8b0412..d65b392 100644
--- a/libgav1/src/dsp/x86/motion_vector_search_sse4.h
+++ b/libgav1/src/dsp/x86/motion_vector_search_sse4.h
@@ -30,8 +30,12 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_MotionVectorSearch
#define LIBGAV1_Dsp8bpp_MotionVectorSearch LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_MOTION_VECTOR_SEARCH_SSE4_H_
diff --git a/libgav1/src/dsp/x86/obmc_sse4.cc b/libgav1/src/dsp/x86/obmc_sse4.cc
index a1be5ef..c34a7f7 100644
--- a/libgav1/src/dsp/x86/obmc_sse4.cc
+++ b/libgav1/src/dsp/x86/obmc_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/obmc.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <xmmintrin.h>
@@ -31,6 +31,7 @@
namespace libgav1 {
namespace dsp {
+namespace low_bitdepth {
namespace {
#include "src/dsp/obmc.inc"
@@ -311,13 +312,295 @@
}
} // namespace
+} // namespace low_bitdepth
-void ObmcInit_SSE4_1() { Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+#include "src/dsp/obmc.inc"
+
+constexpr int kRoundBitsObmcBlend = 6;
+
+inline void OverlapBlendFromLeft2xH_SSE4_1(
+ uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+ const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = _mm_shufflelo_epi16(Load2(kObmcMask), 0x00);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = Load4x2(pred, pred + pred_stride);
+ const __m128i obmc_pred_val =
+ Load4x2(obmc_pred, obmc_pred + obmc_pred_stride);
+ const __m128i terms = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i result = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result, result);
+ Store4(pred, packed_result);
+ Store4(pred + pred_stride, _mm_srli_si128(packed_result, 4));
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
+
+inline void OverlapBlendFromLeft4xH_SSE4_1(
+ uint16_t* const prediction, const ptrdiff_t pred_stride, const int height,
+ const uint16_t* const obmc_prediction, const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ const __m128i mask_inverter = _mm_cvtsi32_si128(0x40404040);
+ const __m128i mask_val = Load4(kObmcMask + 2);
+ // 64 - mask.
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks =
+ _mm_cvtepi8_epi16(_mm_unpacklo_epi8(mask_val, obmc_mask_val));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y -= 2;
+ } while (y != 0);
+}
+
+void OverlapBlendFromLeft10bpp_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+ if (width == 2) {
+ OverlapBlendFromLeft2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromLeft4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const uint8_t* mask = kObmcMask + width - 2;
+ int x = 0;
+ do {
+ pred = static_cast<uint16_t*>(prediction) + x;
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction) + x;
+ const __m128i mask_val = LoadLo8(mask + x);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int y = height;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred, _mm_packus_epi32(result_lo, result_hi));
+
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (--y != 0);
+ x += 8;
+ } while (x < width);
+}
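+
+// Scalar sketch of the per-pixel blend that all three width branches above
+// share (kRoundBitsObmcBlend == 6; |m| comes from kObmcMask per column):
+inline uint16_t ObmcBlendPixel10bppSketch(const uint16_t pred,
+                                          const uint16_t obmc_pred,
+                                          const int m) {
+  const uint32_t sum = m * pred + (64 - m) * obmc_pred;
+  return static_cast<uint16_t>(
+      (sum + (1 << (kRoundBitsObmcBlend - 1))) >> kRoundBitsObmcBlend);
+}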
+
+inline void OverlapBlendFromTop2xH_SSE4_1(uint16_t* const prediction,
+ const ptrdiff_t pred_stride,
+ const int height,
+ const uint16_t* const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height =
+ height - (height >> 2); // compute_height based on 8-bit opt
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+ Store4(pred, packed_result);
+ Store4(pred + pred_stride, _mm_srli_si128(packed_result, 8));
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y += 2;
+ } while (y < compute_height);
+}
+
+inline void OverlapBlendFromTop4xH_SSE4_1(uint16_t* const prediction,
+ const ptrdiff_t pred_stride,
+ const int height,
+ const uint16_t* const obmc_prediction,
+ const ptrdiff_t obmc_pred_stride) {
+ uint16_t* pred = prediction;
+ const uint16_t* obmc_pred = obmc_prediction;
+ const __m128i mask_inverter = _mm_set1_epi16(64);
+ const __m128i mask_shuffler = _mm_set_epi32(0x01010101, 0x01010101, 0, 0);
+ const __m128i mask_preinverter = _mm_set1_epi16(-256 | 1);
+ const uint8_t* mask = kObmcMask + height - 2;
+ const int compute_height = height - (height >> 2);
+ const ptrdiff_t pred_stride2 = pred_stride << 1;
+ const ptrdiff_t obmc_pred_stride2 = obmc_pred_stride << 1;
+ int y = 0;
+ do {
+ // First mask in the first half, second mask in the second half.
+ const __m128i mask_val = _mm_shuffle_epi8(Load4(mask + y), mask_shuffler);
+ const __m128i masks =
+ _mm_sub_epi8(mask_inverter, _mm_sign_epi8(mask_val, mask_preinverter));
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+
+ const __m128i pred_val = LoadHi8(LoadLo8(pred), pred + pred_stride);
+ const __m128i obmc_pred_val =
+ LoadHi8(LoadLo8(obmc_pred), obmc_pred + obmc_pred_stride);
+ const __m128i terms_lo = _mm_unpacklo_epi16(obmc_pred_val, pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(obmc_pred_val, pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ const __m128i packed_result = _mm_packus_epi32(result_lo, result_hi);
+
+ StoreLo8(pred, packed_result);
+ StoreHi8(pred + pred_stride, packed_result);
+ pred += pred_stride2;
+ obmc_pred += obmc_pred_stride2;
+ y += 2;
+ } while (y < compute_height);
+}
+
+void OverlapBlendFromTop10bpp_SSE4_1(void* const prediction,
+ const ptrdiff_t prediction_stride,
+ const int width, const int height,
+ const void* const obmc_prediction,
+ const ptrdiff_t obmc_prediction_stride) {
+ auto* pred = static_cast<uint16_t*>(prediction);
+ const auto* obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ const ptrdiff_t pred_stride = prediction_stride / sizeof(pred[0]);
+ const ptrdiff_t obmc_pred_stride =
+ obmc_prediction_stride / sizeof(obmc_pred[0]);
+
+ if (width == 2) {
+ OverlapBlendFromTop2xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+ if (width == 4) {
+ OverlapBlendFromTop4xH_SSE4_1(pred, pred_stride, height, obmc_pred,
+ obmc_pred_stride);
+ return;
+ }
+
+ const __m128i mask_inverter = _mm_set1_epi8(64);
+ const int compute_height = height - (height >> 2);
+ const uint8_t* mask = kObmcMask + height - 2;
+ pred = static_cast<uint16_t*>(prediction);
+ obmc_pred = static_cast<const uint16_t*>(obmc_prediction);
+ int y = 0;
+ do {
+ const __m128i mask_val = _mm_set1_epi8(mask[y]);
+ // 64 - mask
+ const __m128i obmc_mask_val = _mm_sub_epi8(mask_inverter, mask_val);
+ const __m128i masks = _mm_unpacklo_epi8(mask_val, obmc_mask_val);
+ const __m128i masks_lo = _mm_cvtepi8_epi16(masks);
+ const __m128i masks_hi = _mm_cvtepi8_epi16(_mm_srli_si128(masks, 8));
+ int x = 0;
+ do {
+ const __m128i pred_val = LoadUnaligned16(pred + x);
+ const __m128i obmc_pred_val = LoadUnaligned16(obmc_pred + x);
+ const __m128i terms_lo = _mm_unpacklo_epi16(pred_val, obmc_pred_val);
+ const __m128i terms_hi = _mm_unpackhi_epi16(pred_val, obmc_pred_val);
+ const __m128i result_lo = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_lo, masks_lo), kRoundBitsObmcBlend);
+ const __m128i result_hi = RightShiftWithRounding_U32(
+ _mm_madd_epi16(terms_hi, masks_hi), kRoundBitsObmcBlend);
+ StoreUnaligned16(pred + x, _mm_packus_epi32(result_lo, result_hi));
+ x += 8;
+ } while (x < width);
+ pred += pred_stride;
+ obmc_pred += obmc_pred_stride;
+ } while (++y < compute_height);
+}
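+
+// Note on |compute_height| above: with height == 32, for example, the
+// blending loops stop after 32 - (32 >> 2) == 24 rows and leave the remaining
+// rows untouched, matching the 8-bit implementation this port follows.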
+
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcVertical)
+ dsp->obmc_blend[kObmcDirectionVertical] = OverlapBlendFromTop10bpp_SSE4_1;
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(ObmcHorizontal)
+ dsp->obmc_blend[kObmcDirectionHorizontal] = OverlapBlendFromLeft10bpp_SSE4_1;
+#endif
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void ObmcInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -326,4 +609,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/obmc_sse4.h b/libgav1/src/dsp/x86/obmc_sse4.h
index 03669ad..448d2cf 100644
--- a/libgav1/src/dsp/x86/obmc_sse4.h
+++ b/libgav1/src/dsp/x86/obmc_sse4.h
@@ -31,13 +31,19 @@
// If sse4 is enabled and the baseline isn't set due to a higher level of
// optimization being enabled, signal the sse4 implementation should be used.
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#ifndef LIBGAV1_Dsp8bpp_ObmcVertical
#define LIBGAV1_Dsp8bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
#endif
#ifndef LIBGAV1_Dsp8bpp_ObmcHorizontal
#define LIBGAV1_Dsp8bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
#endif
-#endif // LIBGAV1_ENABLE_SSE4_1
+#ifndef LIBGAV1_Dsp10bpp_ObmcVertical
+#define LIBGAV1_Dsp10bpp_ObmcVertical LIBGAV1_CPU_SSE4_1
+#endif
+#ifndef LIBGAV1_Dsp10bpp_ObmcHorizontal
+#define LIBGAV1_Dsp10bpp_ObmcHorizontal LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_OBMC_SSE4_H_
diff --git a/libgav1/src/dsp/x86/super_res_sse4.cc b/libgav1/src/dsp/x86/super_res_sse4.cc
index 050bcc4..85d05bc 100644
--- a/libgav1/src/dsp/x86/super_res_sse4.cc
+++ b/libgav1/src/dsp/x86/super_res_sse4.cc
@@ -15,13 +15,15 @@
#include "src/dsp/super_res.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
+#include "src/dsp/x86/transpose_sse4.h"
+#include "src/utils/common.h"
#include "src/utils/constants.h"
namespace libgav1 {
@@ -30,12 +32,153 @@
namespace {
// Upscale_Filter as defined in AV1 Section 7.16
+// Negative to make them fit in 8-bit.
+alignas(16) const int8_t
+ kNegativeUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
+ {0, 0, 0, -128, 0, 0, 0, 0}, {0, 0, 1, -128, -2, 1, 0, 0},
+ {0, -1, 3, -127, -4, 2, -1, 0}, {0, -1, 4, -127, -6, 3, -1, 0},
+ {0, -2, 6, -126, -8, 3, -1, 0}, {0, -2, 7, -125, -11, 4, -1, 0},
+ {1, -2, 8, -125, -13, 5, -2, 0}, {1, -3, 9, -124, -15, 6, -2, 0},
+ {1, -3, 10, -123, -18, 6, -2, 1}, {1, -3, 11, -122, -20, 7, -3, 1},
+ {1, -4, 12, -121, -22, 8, -3, 1}, {1, -4, 13, -120, -25, 9, -3, 1},
+ {1, -4, 14, -118, -28, 9, -3, 1}, {1, -4, 15, -117, -30, 10, -4, 1},
+ {1, -5, 16, -116, -32, 11, -4, 1}, {1, -5, 16, -114, -35, 12, -4, 1},
+ {1, -5, 17, -112, -38, 12, -4, 1}, {1, -5, 18, -111, -40, 13, -5, 1},
+ {1, -5, 18, -109, -43, 14, -5, 1}, {1, -6, 19, -107, -45, 14, -5, 1},
+ {1, -6, 19, -105, -48, 15, -5, 1}, {1, -6, 19, -103, -51, 16, -5, 1},
+ {1, -6, 20, -101, -53, 16, -6, 1}, {1, -6, 20, -99, -56, 17, -6, 1},
+ {1, -6, 20, -97, -58, 17, -6, 1}, {1, -6, 20, -95, -61, 18, -6, 1},
+ {2, -7, 20, -93, -64, 18, -6, 2}, {2, -7, 20, -91, -66, 19, -6, 1},
+ {2, -7, 20, -88, -69, 19, -6, 1}, {2, -7, 20, -86, -71, 19, -6, 1},
+ {2, -7, 20, -84, -74, 20, -7, 2}, {2, -7, 20, -81, -76, 20, -7, 1},
+ {2, -7, 20, -79, -79, 20, -7, 2}, {1, -7, 20, -76, -81, 20, -7, 2},
+ {2, -7, 20, -74, -84, 20, -7, 2}, {1, -6, 19, -71, -86, 20, -7, 2},
+ {1, -6, 19, -69, -88, 20, -7, 2}, {1, -6, 19, -66, -91, 20, -7, 2},
+ {2, -6, 18, -64, -93, 20, -7, 2}, {1, -6, 18, -61, -95, 20, -6, 1},
+ {1, -6, 17, -58, -97, 20, -6, 1}, {1, -6, 17, -56, -99, 20, -6, 1},
+ {1, -6, 16, -53, -101, 20, -6, 1}, {1, -5, 16, -51, -103, 19, -6, 1},
+ {1, -5, 15, -48, -105, 19, -6, 1}, {1, -5, 14, -45, -107, 19, -6, 1},
+ {1, -5, 14, -43, -109, 18, -5, 1}, {1, -5, 13, -40, -111, 18, -5, 1},
+ {1, -4, 12, -38, -112, 17, -5, 1}, {1, -4, 12, -35, -114, 16, -5, 1},
+ {1, -4, 11, -32, -116, 16, -5, 1}, {1, -4, 10, -30, -117, 15, -4, 1},
+ {1, -3, 9, -28, -118, 14, -4, 1}, {1, -3, 9, -25, -120, 13, -4, 1},
+ {1, -3, 8, -22, -121, 12, -4, 1}, {1, -3, 7, -20, -122, 11, -3, 1},
+ {1, -2, 6, -18, -123, 10, -3, 1}, {0, -2, 6, -15, -124, 9, -3, 1},
+ {0, -2, 5, -13, -125, 8, -2, 1}, {0, -1, 4, -11, -125, 7, -2, 0},
+ {0, -1, 3, -8, -126, 6, -2, 0}, {0, -1, 3, -6, -127, 4, -1, 0},
+ {0, -1, 2, -4, -127, 3, -1, 0}, {0, 0, 1, -2, -128, 1, 0, 0},
+};
+
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint8_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 16) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadLo8(kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ remainder = subpixel_x & kSuperResScaleMask;
+ filter = LoadHi8(filter,
+ kNegativeUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
+}
+
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint8_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint8_t*>(coefficients);
+ uint8_t* dst_ptr = dst;
+ ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalBorder);
+ int subpixel_x = initial_subpixel_x;
+ // The code below calculates up to 15 extra upscaled pixels, which will
+ // over-read up to 15 downscaled pixels at the end of each row.
+ // kSuperResHorizontalPadding keeps this over-read from causing segmentation
+ // faults and threading issues.
+ int x = RightShiftWithCeiling(upscaled_width, 4);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 16) {
+ // TODO(b/178652672): Remove Msan loads when hadd bug is resolved.
+ // It's fine to write uninitialized bytes outside the frame, but the
+ // inside-frame pixels are incorrectly labeled uninitialized if
+ // uninitialized values go through the hadd intrinsics.
+ // |src| is offset 4 pixels to the left, and there are 4 extended border
+ // pixels, so a difference of 0 from |downscaled_width| indicates 8 good
+ // bytes. A difference of 1 indicates 7 good bytes.
+ const int msan_bytes_lo =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ __m128i s =
+ LoadLo8Msan(&src[subpixel_x >> kSuperResScaleBits], msan_bytes_lo);
+ subpixel_x += step;
+ const int msan_bytes_hi =
+ (subpixel_x >> kSuperResScaleBits) - downscaled_width;
+ s = LoadHi8Msan(s, &src[subpixel_x >> kSuperResScaleBits],
+ msan_bytes_hi);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_maddubs_epi16(s, f);
+ }
+
+ __m128i a[4];
+ a[0] = _mm_hadd_epi16(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi16(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi16(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi16(weighted_src[6], weighted_src[7]);
+ Transpose2x16_U16(a, a);
+ a[0] = _mm_adds_epi16(a[0], a[1]);
+ a[1] = _mm_adds_epi16(a[2], a[3]);
+ const __m128i rounding = _mm_set1_epi16(1 << (kFilterBits - 1));
+ a[0] = _mm_subs_epi16(rounding, a[0]);
+ a[1] = _mm_subs_epi16(rounding, a[1]);
+ a[0] = _mm_srai_epi16(a[0], kFilterBits);
+ a[1] = _mm_srai_epi16(a[1], kFilterBits);
+ StoreAligned16(dst_ptr, _mm_packus_epi16(a[0], a[1]));
+ dst_ptr += 16;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
+}
+
+void Init8bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
+#if DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperResCoefficients)
+#if DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+ dsp->super_res = SuperRes_SSE4_1;
+#endif // DSP_ENABLED_8BPP_SSE4_1(SuperRes)
+}
+
+} // namespace
+} // namespace low_bitdepth
+
+//------------------------------------------------------------------------------
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+// Upscale_Filter as defined in AV1 Section 7.16
alignas(16) const int16_t
kUpscaleFilter[kSuperResFilterShifts][kSuperResFilterTaps] = {
- {-0, 0, -0, 128, 0, -0, 0, -0}, {-0, 0, -1, 128, 2, -1, 0, -0},
- {-0, 1, -3, 127, 4, -2, 1, -0}, {-0, 1, -4, 127, 6, -3, 1, -0},
- {-0, 2, -6, 126, 8, -3, 1, -0}, {-0, 2, -7, 125, 11, -4, 1, -0},
- {-1, 2, -8, 125, 13, -5, 2, -0}, {-1, 3, -9, 124, 15, -6, 2, -0},
+ {0, 0, 0, 128, 0, 0, 0, 0}, {0, 0, -1, 128, 2, -1, 0, 0},
+ {0, 1, -3, 127, 4, -2, 1, 0}, {0, 1, -4, 127, 6, -3, 1, 0},
+ {0, 2, -6, 126, 8, -3, 1, 0}, {0, 2, -7, 125, 11, -4, 1, 0},
+ {-1, 2, -8, 125, 13, -5, 2, 0}, {-1, 3, -9, 124, 15, -6, 2, 0},
{-1, 3, -10, 123, 18, -6, 2, -1}, {-1, 3, -11, 122, 20, -7, 3, -1},
{-1, 4, -12, 121, 22, -8, 3, -1}, {-1, 4, -13, 120, 25, -9, 3, -1},
{-1, 4, -14, 118, 28, -9, 3, -1}, {-1, 4, -15, 117, 30, -10, 4, -1},
@@ -60,91 +203,111 @@
{-1, 4, -11, 32, 116, -16, 5, -1}, {-1, 4, -10, 30, 117, -15, 4, -1},
{-1, 3, -9, 28, 118, -14, 4, -1}, {-1, 3, -9, 25, 120, -13, 4, -1},
{-1, 3, -8, 22, 121, -12, 4, -1}, {-1, 3, -7, 20, 122, -11, 3, -1},
- {-1, 2, -6, 18, 123, -10, 3, -1}, {-0, 2, -6, 15, 124, -9, 3, -1},
- {-0, 2, -5, 13, 125, -8, 2, -1}, {-0, 1, -4, 11, 125, -7, 2, -0},
- {-0, 1, -3, 8, 126, -6, 2, -0}, {-0, 1, -3, 6, 127, -4, 1, -0},
- {-0, 1, -2, 4, 127, -3, 1, -0}, {-0, 0, -1, 2, 128, -1, 0, -0},
+ {-1, 2, -6, 18, 123, -10, 3, -1}, {0, 2, -6, 15, 124, -9, 3, -1},
+ {0, 2, -5, 13, 125, -8, 2, -1}, {0, 1, -4, 11, 125, -7, 2, 0},
+ {0, 1, -3, 8, 126, -6, 2, 0}, {0, 1, -3, 6, 127, -4, 1, 0},
+ {0, 1, -2, 4, 127, -3, 1, 0}, {0, 0, -1, 2, 128, -1, 0, 0},
};
-inline void ComputeSuperRes4(const uint8_t* src, uint8_t* dst_x, int step,
- int* p) {
- __m128i weighted_src[4];
- for (int i = 0; i < 4; ++i, *p += step) {
- const __m128i src_x = LoadLo8(&src[*p >> kSuperResScaleBits]);
- const int remainder = *p & kSuperResScaleMask;
- const __m128i filter =
- LoadUnaligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
- weighted_src[i] = _mm_madd_epi16(_mm_cvtepu8_epi16(src_x), filter);
- }
-
- // Pairwise add is chosen in favor of transpose and add because of the
- // ability to take advantage of madd.
- const __m128i res0 = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
- const __m128i res1 = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
- const __m128i result0 = _mm_hadd_epi32(res0, res1);
- const __m128i result = _mm_packus_epi32(
- RightShiftWithRounding_S32(result0, kFilterBits), result0);
- Store4(dst_x, _mm_packus_epi16(result, result));
+void SuperResCoefficients_SSE4_1(const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const coefficients) {
+ auto* dst = static_cast<uint16_t*>(coefficients);
+ int subpixel_x = initial_subpixel_x;
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ for (int i = 0; i < 8; ++i, dst += 8) {
+ int remainder = subpixel_x & kSuperResScaleMask;
+ __m128i filter =
+ LoadAligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
+ subpixel_x += step;
+ StoreAligned16(dst, filter);
+ }
+ } while (--x != 0);
}
-inline void ComputeSuperRes8(const uint8_t* src, uint8_t* dst_x, int step,
- int* p) {
- __m128i weighted_src[8];
- for (int i = 0; i < 8; ++i, *p += step) {
- const __m128i src_x = LoadLo8(&src[*p >> kSuperResScaleBits]);
- const int remainder = *p & kSuperResScaleMask;
- const __m128i filter =
- LoadUnaligned16(kUpscaleFilter[remainder >> kSuperResExtraBits]);
- weighted_src[i] = _mm_madd_epi16(_mm_cvtepu8_epi16(src_x), filter);
- }
+template <int bitdepth>
+void SuperRes_SSE4_1(const void* const coefficients, void* const source,
+ const ptrdiff_t source_stride, const int height,
+ const int downscaled_width, const int upscaled_width,
+ const int initial_subpixel_x, const int step,
+ void* const dest, const ptrdiff_t dest_stride) {
+ auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
+ auto* dst = static_cast<uint16_t*>(dest);
+ int y = height;
+ do {
+ const auto* filter = static_cast<const uint16_t*>(coefficients);
+ uint16_t* dst_ptr = dst;
+ ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
+ kSuperResHorizontalBorder, kSuperResHorizontalPadding);
+ int subpixel_x = initial_subpixel_x;
+ // The code below calculates up to 7 extra upscaled pixels, which will
+ // over-read up to 7 downscaled pixels at the end of each row.
+ // kSuperResHorizontalPadding accounts for this.
+ int x = RightShiftWithCeiling(upscaled_width, 3);
+ do {
+ __m128i weighted_src[8];
+ for (int i = 0; i < 8; ++i, filter += 8) {
+ const __m128i s =
+ LoadUnaligned16(&src[subpixel_x >> kSuperResScaleBits]);
+ subpixel_x += step;
+ const __m128i f = LoadAligned16(filter);
+ weighted_src[i] = _mm_madd_epi16(s, f);
+ }
- // Pairwise add is chosen in favor of transpose and add because of the
- // ability to take advantage of madd.
- const __m128i res0 = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
- const __m128i res1 = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
- const __m128i res2 = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
- const __m128i res3 = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
- const __m128i result0 = _mm_hadd_epi32(res0, res1);
- const __m128i result1 = _mm_hadd_epi32(res2, res3);
- const __m128i result =
- _mm_packus_epi32(RightShiftWithRounding_S32(result0, kFilterBits),
- RightShiftWithRounding_S32(result1, kFilterBits));
- StoreLo8(dst_x, _mm_packus_epi16(result, result));
+ __m128i a[4];
+ a[0] = _mm_hadd_epi32(weighted_src[0], weighted_src[1]);
+ a[1] = _mm_hadd_epi32(weighted_src[2], weighted_src[3]);
+ a[2] = _mm_hadd_epi32(weighted_src[4], weighted_src[5]);
+ a[3] = _mm_hadd_epi32(weighted_src[6], weighted_src[7]);
+
+ a[0] = _mm_hadd_epi32(a[0], a[1]);
+ a[1] = _mm_hadd_epi32(a[2], a[3]);
+ a[0] = RightShiftWithRounding_S32(a[0], kFilterBits);
+ a[1] = RightShiftWithRounding_S32(a[1], kFilterBits);
+
+ // Clip the values to (1 << bitdepth) - 1.
+ const __m128i clipped_16 = _mm_min_epi16(
+ _mm_packus_epi32(a[0], a[1]), _mm_set1_epi16((1 << bitdepth) - 1));
+ StoreAligned16(dst_ptr, clipped_16);
+ dst_ptr += 8;
+ } while (--x != 0);
+ src += source_stride;
+ dst += dest_stride;
+ } while (--y != 0);
}
-void ComputeSuperRes_SSE4_1(const void* source, const int upscaled_width,
- const int initial_subpixel_x, const int step,
- void* const dest) {
- const auto* src = static_cast<const uint8_t*>(source);
- auto* dst = static_cast<uint8_t*>(dest);
- src -= kSuperResFilterTaps >> 1;
-
- int p = initial_subpixel_x;
- int x = 0;
- for (; x < (upscaled_width & ~7); x += 8) {
- ComputeSuperRes8(src, &dst[x], step, &p);
- }
- // The below code can overwrite at most 3 bytes and overread at most 7.
- // kSuperResHorizontalBorder accounts for this.
- for (; x < upscaled_width; x += 4) {
- ComputeSuperRes4(src, &dst[x], step, &p);
- }
-}
-
-void Init8bpp() {
- Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
- dsp->super_res_row = ComputeSuperRes_SSE4_1;
+void Init10bpp() {
+ Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ static_cast<void>(dsp);
+#if DSP_ENABLED_10BPP_SSE4_1(SuperResCoefficients)
+ dsp->super_res_coefficients = SuperResCoefficients_SSE4_1;
+#else
+ static_cast<void>(SuperResCoefficients_SSE4_1);
+#endif
+#if DSP_ENABLED_10BPP_SSE4_1(SuperRes)
+ dsp->super_res = SuperRes_SSE4_1<10>;
+#else
+ static_cast<void>(SuperRes_SSE4_1);
+#endif
}
} // namespace
-} // namespace low_bitdepth
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
-void SuperResInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+void SuperResInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -153,4 +316,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
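For reference, a scalar sketch of the per-pixel arithmetic the loops above vectorize. The constants follow AV1 section 7.16; |filter| stands for the non-negated Upscale_Filter table (the 8bpp path negates it so each tap fits in int8_t for _mm_maddubs_epi16, then subtracts from the rounding constant to undo the negation). A sketch, not part of the patch.
#include <cstdint>

namespace {
constexpr int kSuperResScaleBits = 14;  // AV1 SUPERRES_SCALE_BITS
constexpr int kSuperResScaleMask = (1 << kSuperResScaleBits) - 1;
constexpr int kSuperResExtraBits = 8;   // AV1 SUPERRES_EXTRA_BITS
constexpr int kSuperResFilterTaps = 8;
constexpr int kFilterBits = 7;          // the taps sum to 128 == 1 << 7

// One 8bpp row; |src| points at the first downscaled pixel of the row.
void SuperResRowScalar(const uint8_t* src,
                       const int16_t filter[][kSuperResFilterTaps],
                       int upscaled_width, int initial_subpixel_x, int step,
                       uint8_t* dst) {
  int subpixel_x = initial_subpixel_x;
  for (int x = 0; x < upscaled_width; ++x, subpixel_x += step) {
    const int src_x = subpixel_x >> kSuperResScaleBits;
    const int phase = (subpixel_x & kSuperResScaleMask) >> kSuperResExtraBits;
    int sum = 0;
    for (int t = 0; t < kSuperResFilterTaps; ++t) {
      // The filter reaches taps/2 pixels left of center, which is why the
      // vector code offsets |src| by DivideBy2(kSuperResFilterTaps).
      sum += src[src_x + t - kSuperResFilterTaps / 2] * filter[phase][t];
    }
    const int rounded = (sum + (1 << (kFilterBits - 1))) >> kFilterBits;
    dst[x] = static_cast<uint8_t>(
        rounded < 0 ? 0 : rounded > 255 ? 255 : rounded);
  }
}
}  // namespace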
diff --git a/libgav1/src/dsp/x86/super_res_sse4.h b/libgav1/src/dsp/x86/super_res_sse4.h
index 5673ca5..07a7ef4 100644
--- a/libgav1/src/dsp/x86/super_res_sse4.h
+++ b/libgav1/src/dsp/x86/super_res_sse4.h
@@ -29,8 +29,22 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+#ifndef LIBGAV1_Dsp8bpp_SuperResCoefficients
+#define LIBGAV1_Dsp8bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_SuperRes
#define LIBGAV1_Dsp8bpp_SuperRes LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperResCoefficients
+#define LIBGAV1_Dsp10bpp_SuperResCoefficients LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_SuperRes
+#define LIBGAV1_Dsp10bpp_SuperRes LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_SUPER_RES_SSE4_H_
diff --git a/libgav1/src/dsp/x86/transpose_sse4.h b/libgav1/src/dsp/x86/transpose_sse4.h
index cd61c92..9726495 100644
--- a/libgav1/src/dsp/x86/transpose_sse4.h
+++ b/libgav1/src/dsp/x86/transpose_sse4.h
@@ -20,12 +20,46 @@
#include "src/utils/compiler_attributes.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <emmintrin.h>
namespace libgav1 {
namespace dsp {
+LIBGAV1_ALWAYS_INLINE void Transpose2x16_U16(const __m128i* const in,
+ __m128i* const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 10 11 20 21 30 31
+ // in[1]: 40 41 50 51 60 61 70 71
+ // in[2]: 80 81 90 91 a0 a1 b0 b1
+ // in[3]: c0 c1 d0 d1 e0 e1 f0 f1
+ // to:
+ // a0: 00 40 01 41 10 50 11 51
+ // a1: 20 60 21 61 30 70 31 71
+ // a2: 80 c0 81 c1 90 d0 91 d1
+ // a3: a0 e0 a1 e1 b0 f0 b1 f1
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a3 = _mm_unpackhi_epi16(in[2], in[3]);
+ // b0: 00 20 40 60 01 21 41 61
+ // b1: 10 30 50 70 11 31 51 71
+ // b2: 80 a0 c0 e0 81 a1 c1 e1
+ // b3: 90 b0 d0 f0 91 b1 d1 f1
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 80 90 a0 b0 c0 d0 e0 f0
+ // out[3]: 81 91 a1 b1 c1 d1 e1 f1
+ out[0] = _mm_unpacklo_epi16(b0, b1);
+ out[1] = _mm_unpackhi_epi16(b0, b1);
+ out[2] = _mm_unpacklo_epi16(b2, b3);
+ out[3] = _mm_unpackhi_epi16(b2, b3);
+}
+
LIBGAV1_ALWAYS_INLINE __m128i Transpose4x4_U8(const __m128i* const in) {
// Unpack 8 bit elements. Goes from:
// in[0]: 00 01 02 03
@@ -269,5 +303,5 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_TRANSPOSE_SSE4_H_
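For reference, a scalar model of the lane movement in the new Transpose2x16_U16: the four input registers hold a 16x2 matrix of uint16_t in the 00 01 / 10 11 / ... layout of the comments, and the outputs use the out[0..3] lane order shown. Illustrative only.
#include <cstdint>

// in[2 * row + col] holds element (row, col); out is the flattened out[0..3]
// register layout documented above (out[0]/out[1] cover rows 0-7, out[2]/
// out[3] cover rows 8-15).
void Transpose2x16Scalar(const uint16_t in[32], uint16_t out[32]) {
  for (int row = 0; row < 16; ++row) {
    for (int col = 0; col < 2; ++col) {
      const int half = row >> 3;
      out[16 * half + 8 * col + (row & 7)] = in[2 * row + col];
    }
  }
}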
diff --git a/libgav1/src/dsp/x86/warp_sse4.cc b/libgav1/src/dsp/x86/warp_sse4.cc
index 4c9e716..9ddfeac 100644
--- a/libgav1/src/dsp/x86/warp_sse4.cc
+++ b/libgav1/src/dsp/x86/warp_sse4.cc
@@ -15,7 +15,7 @@
#include "src/dsp/warp.h"
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -513,7 +513,7 @@
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -522,4 +522,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
diff --git a/libgav1/src/dsp/x86/warp_sse4.h b/libgav1/src/dsp/x86/warp_sse4.h
index 51fbf43..a2dc5ca 100644
--- a/libgav1/src/dsp/x86/warp_sse4.h
+++ b/libgav1/src/dsp/x86/warp_sse4.h
@@ -29,9 +29,16 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_Warp
#define LIBGAV1_Dsp8bpp_Warp LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WarpCompound
#define LIBGAV1_Dsp8bpp_WarpCompound LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_WARP_SSE4_H_
diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.cc b/libgav1/src/dsp/x86/weight_mask_sse4.cc
index 9d9d9c4..08a1739 100644
--- a/libgav1/src/dsp/x86/weight_mask_sse4.cc
+++ b/libgav1/src/dsp/x86/weight_mask_sse4.cc
@@ -16,7 +16,7 @@
#include "src/utils/cpu.h"
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>
@@ -36,47 +36,65 @@
constexpr int kRoundingBits8bpp = 4;
-template <bool mask_is_inverse>
-inline void WeightMask8_SSE4(const int16_t* prediction_0,
- const int16_t* prediction_1, uint8_t* mask) {
- const __m128i pred_0 = LoadAligned16(prediction_0);
- const __m128i pred_1 = LoadAligned16(prediction_1);
- const __m128i difference = RightShiftWithRounding_U16(
- _mm_abs_epi16(_mm_sub_epi16(pred_0, pred_1)), kRoundingBits8bpp);
- const __m128i scaled_difference = _mm_srli_epi16(difference, 4);
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_SSE4(const int16_t* prediction_0,
+ const int16_t* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i difference_0 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_00, pred_10)), kRoundingBits8bpp);
+ const __m128i scaled_difference_0 = _mm_srli_epi16(difference_0, 4);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i difference_1 = RightShiftWithRounding_U16(
+ _mm_abs_epi16(_mm_sub_epi16(pred_01, pred_11)), kRoundingBits8bpp);
+ const __m128i scaled_difference_1 = _mm_srli_epi16(difference_1, 4);
+
const __m128i difference_offset = _mm_set1_epi8(38);
const __m128i adjusted_difference =
- _mm_adds_epu8(_mm_packus_epi16(scaled_difference, scaled_difference),
+ _mm_adds_epu8(_mm_packus_epi16(scaled_difference_0, scaled_difference_1),
difference_offset);
const __m128i mask_ceiling = _mm_set1_epi8(64);
const __m128i mask_value = _mm_min_epi8(adjusted_difference, mask_ceiling);
if (mask_is_inverse) {
const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
- StoreLo8(mask, inverted_mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
} else {
- StoreLo8(mask, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
}
}
-#define WEIGHT8_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask)
+#define WEIGHT8_PAIR_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, mask_stride)
-#define WEIGHT8_AND_STRIDE \
- WEIGHT8_WITHOUT_STRIDE; \
- pred_0 += 8; \
- pred_1 += 8; \
- mask += mask_stride
+#define WEIGHT8_PAIR_AND_STRIDE \
+ WEIGHT8_PAIR_WITHOUT_STRIDE; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
template <bool mask_is_inverse>
void WeightMask8x8_SSE4(const void* prediction_0, const void* prediction_1,
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y = 0;
- do {
- WEIGHT8_AND_STRIDE;
- } while (++y < 7);
- WEIGHT8_WITHOUT_STRIDE;
+
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
template <bool mask_is_inverse>
@@ -84,13 +102,13 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 3;
do {
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- } while (++y3 < 5);
- WEIGHT8_WITHOUT_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
template <bool mask_is_inverse>
@@ -98,21 +116,17 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 5;
do {
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- WEIGHT8_AND_STRIDE;
- } while (++y5 < 6);
- WEIGHT8_AND_STRIDE;
- WEIGHT8_WITHOUT_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ WEIGHT8_PAIR_AND_STRIDE;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE;
}
-#define WEIGHT16_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8)
+#define WEIGHT16_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride)
#define WEIGHT16_AND_STRIDE \
WEIGHT16_WITHOUT_STRIDE; \
@@ -125,10 +139,10 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y = 0;
+ int y = 7;
do {
WEIGHT16_AND_STRIDE;
- } while (++y < 7);
+ } while (--y != 0);
WEIGHT16_WITHOUT_STRIDE;
}
@@ -137,12 +151,12 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 5;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y3 < 5);
+ } while (--y3 != 0);
WEIGHT16_WITHOUT_STRIDE;
}
@@ -151,14 +165,14 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 6;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y5 < 6);
+ } while (--y5 != 0);
WEIGHT16_AND_STRIDE;
WEIGHT16_WITHOUT_STRIDE;
}
@@ -168,20 +182,19 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 21;
do {
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
WEIGHT16_AND_STRIDE;
- } while (++y3 < 21);
+ } while (--y3 != 0);
WEIGHT16_WITHOUT_STRIDE;
}
-#define WEIGHT32_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24)
+#define WEIGHT32_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
#define WEIGHT32_AND_STRIDE \
WEIGHT32_WITHOUT_STRIDE; \
@@ -209,12 +222,12 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 5;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y3 < 5);
+ } while (--y3 != 0);
WEIGHT32_WITHOUT_STRIDE;
}
@@ -223,14 +236,14 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y5 = 0;
+ int y5 = 6;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y5 < 6);
+ } while (--y5 != 0);
WEIGHT32_AND_STRIDE;
WEIGHT32_WITHOUT_STRIDE;
}
@@ -240,24 +253,23 @@
uint8_t* mask, ptrdiff_t mask_stride) {
const auto* pred_0 = static_cast<const int16_t*>(prediction_0);
const auto* pred_1 = static_cast<const int16_t*>(prediction_1);
- int y3 = 0;
+ int y3 = 21;
do {
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
WEIGHT32_AND_STRIDE;
- } while (++y3 < 21);
+ } while (--y3 != 0);
WEIGHT32_WITHOUT_STRIDE;
}
-#define WEIGHT64_WITHOUT_STRIDE \
- WeightMask8_SSE4<mask_is_inverse>(pred_0, pred_1, mask); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 8, pred_1 + 8, mask + 8); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 16, pred_1 + 16, mask + 16); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 24, pred_1 + 24, mask + 24); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 32, pred_1 + 32, mask + 32); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 40, pred_1 + 40, mask + 40); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 48, pred_1 + 48, mask + 48); \
- WeightMask8_SSE4<mask_is_inverse>(pred_0 + 56, pred_1 + 56, mask + 56)
+#define WEIGHT64_WITHOUT_STRIDE \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
#define WEIGHT64_AND_STRIDE \
WEIGHT64_WITHOUT_STRIDE; \
@@ -447,12 +459,491 @@
} // namespace
} // namespace low_bitdepth
-void WeightMaskInit_SSE4_1() { low_bitdepth::Init8bpp(); }
+#if LIBGAV1_MAX_BITDEPTH >= 10
+namespace high_bitdepth {
+namespace {
+
+constexpr int kRoundingBits10bpp = 6;
+constexpr int kScaledDiffShift = 4;
+
+template <bool mask_is_inverse, bool is_store_16>
+inline void WeightMask16_10bpp_SSE4(const uint16_t* prediction_0,
+ const uint16_t* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const __m128i diff_offset = _mm_set1_epi8(38);
+ const __m128i mask_ceiling = _mm_set1_epi8(64);
+ const __m128i zero = _mm_setzero_si128();
+
+ // Range of prediction: [3988, 61532].
+ const __m128i pred_00 = LoadAligned16(prediction_0);
+ const __m128i pred_10 = LoadAligned16(prediction_1);
+ const __m128i pred_lo_00 = _mm_cvtepu16_epi32(pred_00);
+ const __m128i pred_lo_10 = _mm_cvtepu16_epi32(pred_10);
+ const __m128i diff_lo_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_00, pred_lo_10)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_00 = _mm_unpackhi_epi16(pred_00, zero);
+ const __m128i pred_hi_10 = _mm_unpackhi_epi16(pred_10, zero);
+ const __m128i diff_hi_0 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_00, pred_hi_10)), kRoundingBits10bpp);
+
+ const __m128i diff_0 = _mm_packus_epi32(diff_lo_0, diff_hi_0);
+ const __m128i scaled_diff_0 = _mm_srli_epi16(diff_0, kScaledDiffShift);
+
+ const __m128i pred_01 = LoadAligned16(prediction_0 + 8);
+ const __m128i pred_11 = LoadAligned16(prediction_1 + 8);
+ const __m128i pred_lo_01 = _mm_cvtepu16_epi32(pred_01);
+ const __m128i pred_lo_11 = _mm_cvtepu16_epi32(pred_11);
+ const __m128i diff_lo_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_lo_01, pred_lo_11)), kRoundingBits10bpp);
+
+ const __m128i pred_hi_01 = _mm_unpackhi_epi16(pred_01, zero);
+ const __m128i pred_hi_11 = _mm_unpackhi_epi16(pred_11, zero);
+ const __m128i diff_hi_1 = RightShiftWithRounding_U32(
+ _mm_abs_epi32(_mm_sub_epi32(pred_hi_01, pred_hi_11)), kRoundingBits10bpp);
+
+ const __m128i diff_1 = _mm_packus_epi32(diff_lo_1, diff_hi_1);
+ const __m128i scaled_diff_1 = _mm_srli_epi16(diff_1, kScaledDiffShift);
+
+ const __m128i adjusted_diff = _mm_adds_epu8(
+ _mm_packus_epi16(scaled_diff_0, scaled_diff_1), diff_offset);
+ const __m128i mask_value = _mm_min_epi8(adjusted_diff, mask_ceiling);
+
+ if (mask_is_inverse) {
+ const __m128i inverted_mask_value = _mm_sub_epi8(mask_ceiling, mask_value);
+ if (is_store_16) {
+ StoreAligned16(mask, inverted_mask_value);
+ } else {
+ StoreLo8(mask, inverted_mask_value);
+ StoreHi8(mask + mask_stride, inverted_mask_value);
+ }
+ } else {
+ if (is_store_16) {
+ StoreAligned16(mask, mask_value);
+ } else {
+ StoreLo8(mask, mask_value);
+ StoreHi8(mask + mask_stride, mask_value);
+ }
+ }
+}
+
+#define WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, false>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT8_PAIR_AND_STRIDE_10BPP \
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 8 << 1; \
+ pred_1 += 8 << 1; \
+ mask += mask_stride << 1
+
+template <bool mask_is_inverse>
+void WeightMask8x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 3;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask8x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 5;
+ do {
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ WEIGHT8_PAIR_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT8_PAIR_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT16_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride)
+
+#define WEIGHT16_AND_STRIDE_10BPP \
+ WEIGHT16_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 16; \
+ pred_1 += 16; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask16x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y = 7;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask16x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ WEIGHT16_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT16_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT32_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride)
+
+#define WEIGHT32_AND_STRIDE_10BPP \
+ WEIGHT32_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 32; \
+ pred_1 += 32; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask32x8_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask32x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ WEIGHT32_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT32_WITHOUT_STRIDE_10BPP;
+}
+
+#define WEIGHT64_WITHOUT_STRIDE_10BPP \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0, pred_1, mask, \
+ mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 16, pred_1 + 16, \
+ mask + 16, mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 32, pred_1 + 32, \
+ mask + 32, mask_stride); \
+ WeightMask16_10bpp_SSE4<mask_is_inverse, true>(pred_0 + 48, pred_1 + 48, \
+ mask + 48, mask_stride)
+
+#define WEIGHT64_AND_STRIDE_10BPP \
+ WEIGHT64_WITHOUT_STRIDE_10BPP; \
+ pred_0 += 64; \
+ pred_1 += 64; \
+ mask += mask_stride
+
+template <bool mask_is_inverse>
+void WeightMask64x16_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 5;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x32_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y5 = 6;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y5 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask64x128_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ do {
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_AND_STRIDE_10BPP;
+ } while (--y3 != 0);
+ WEIGHT64_AND_STRIDE_10BPP;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x64_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 21;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+template <bool mask_is_inverse>
+void WeightMask128x128_10bpp_SSE4(const void* prediction_0,
+ const void* prediction_1, uint8_t* mask,
+ ptrdiff_t mask_stride) {
+ const auto* pred_0 = static_cast<const uint16_t*>(prediction_0);
+ const auto* pred_1 = static_cast<const uint16_t*>(prediction_1);
+ int y3 = 42;
+ const ptrdiff_t adjusted_mask_stride = mask_stride - 64;
+ do {
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+ } while (--y3 != 0);
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += adjusted_mask_stride;
+
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+ pred_0 += 64;
+ pred_1 += 64;
+ mask += 64;
+ WEIGHT64_WITHOUT_STRIDE_10BPP;
+}
+
+#define INIT_WEIGHT_MASK_10BPP(width, height, w_index, h_index) \
+ dsp->weight_mask[w_index][h_index][0] = \
+ WeightMask##width##x##height##_10bpp_SSE4<0>; \
+ dsp->weight_mask[w_index][h_index][1] = \
+ WeightMask##width##x##height##_10bpp_SSE4<1>
+void Init10bpp() {
+ Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
+ assert(dsp != nullptr);
+ INIT_WEIGHT_MASK_10BPP(8, 8, 0, 0);
+ INIT_WEIGHT_MASK_10BPP(8, 16, 0, 1);
+ INIT_WEIGHT_MASK_10BPP(8, 32, 0, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 8, 1, 0);
+ INIT_WEIGHT_MASK_10BPP(16, 16, 1, 1);
+ INIT_WEIGHT_MASK_10BPP(16, 32, 1, 2);
+ INIT_WEIGHT_MASK_10BPP(16, 64, 1, 3);
+ INIT_WEIGHT_MASK_10BPP(32, 8, 2, 0);
+ INIT_WEIGHT_MASK_10BPP(32, 16, 2, 1);
+ INIT_WEIGHT_MASK_10BPP(32, 32, 2, 2);
+ INIT_WEIGHT_MASK_10BPP(32, 64, 2, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 16, 3, 1);
+ INIT_WEIGHT_MASK_10BPP(64, 32, 3, 2);
+ INIT_WEIGHT_MASK_10BPP(64, 64, 3, 3);
+ INIT_WEIGHT_MASK_10BPP(64, 128, 3, 4);
+ INIT_WEIGHT_MASK_10BPP(128, 64, 4, 3);
+ INIT_WEIGHT_MASK_10BPP(128, 128, 4, 4);
+}
+
+} // namespace
+} // namespace high_bitdepth
+#endif // LIBGAV1_MAX_BITDEPTH >= 10
+
+void WeightMaskInit_SSE4_1() {
+ low_bitdepth::Init8bpp();
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ high_bitdepth::Init10bpp();
+#endif
+}
} // namespace dsp
} // namespace libgav1
-#else // !LIBGAV1_ENABLE_SSE4_1
+#else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {
@@ -461,4 +952,4 @@
} // namespace dsp
} // namespace libgav1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif // LIBGAV1_TARGETING_SSE4_1
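For reference, a scalar sketch of the per-pixel mask both bitdepth paths above compute: |rounding_bits| is kRoundingBits8bpp == 4 for 8bpp and kRoundingBits10bpp == 6 for 10bpp, and the extra >> 4 matches kScaledDiffShift / the _mm_srli_epi16 by 4. A sketch, not part of the patch.
#include <algorithm>
#include <cstdint>
#include <cstdlib>

uint8_t WeightMaskPixel(int pred_0, int pred_1, int rounding_bits,
                        bool mask_is_inverse) {
  // RightShiftWithRounding of the absolute prediction difference.
  const int diff = (std::abs(pred_0 - pred_1) + (1 << (rounding_bits - 1))) >>
                   rounding_bits;
  const int scaled = diff >> 4;  // kScaledDiffShift
  // 38 is difference_offset, 64 is mask_ceiling; _mm_adds_epu8 saturates at
  // 255 first, but the min against 64 makes that immaterial here.
  const int mask = std::min(scaled + 38, 64);
  return static_cast<uint8_t>(mask_is_inverse ? 64 - mask : mask);
}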
diff --git a/libgav1/src/dsp/x86/weight_mask_sse4.h b/libgav1/src/dsp/x86/weight_mask_sse4.h
index 841dd5a..e5d9d70 100644
--- a/libgav1/src/dsp/x86/weight_mask_sse4.h
+++ b/libgav1/src/dsp/x86/weight_mask_sse4.h
@@ -29,24 +29,143 @@
} // namespace dsp
} // namespace libgav1
-#if LIBGAV1_ENABLE_SSE4_1
+#if LIBGAV1_TARGETING_SSE4_1
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x8
#define LIBGAV1_Dsp8bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x16
#define LIBGAV1_Dsp8bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_8x32
#define LIBGAV1_Dsp8bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x8
#define LIBGAV1_Dsp8bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x16
#define LIBGAV1_Dsp8bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x32
#define LIBGAV1_Dsp8bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_16x64
#define LIBGAV1_Dsp8bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x8
#define LIBGAV1_Dsp8bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x16
#define LIBGAV1_Dsp8bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x32
#define LIBGAV1_Dsp8bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_32x64
#define LIBGAV1_Dsp8bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x16
#define LIBGAV1_Dsp8bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x32
#define LIBGAV1_Dsp8bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x64
#define LIBGAV1_Dsp8bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_64x128
#define LIBGAV1_Dsp8bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x64
#define LIBGAV1_Dsp8bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp8bpp_WeightMask_128x128
#define LIBGAV1_Dsp8bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
-#endif // LIBGAV1_ENABLE_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x8
+#define LIBGAV1_Dsp10bpp_WeightMask_8x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x16
+#define LIBGAV1_Dsp10bpp_WeightMask_8x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_8x32
+#define LIBGAV1_Dsp10bpp_WeightMask_8x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x8
+#define LIBGAV1_Dsp10bpp_WeightMask_16x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x16
+#define LIBGAV1_Dsp10bpp_WeightMask_16x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x32
+#define LIBGAV1_Dsp10bpp_WeightMask_16x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_16x64
+#define LIBGAV1_Dsp10bpp_WeightMask_16x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x8
+#define LIBGAV1_Dsp10bpp_WeightMask_32x8 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x16
+#define LIBGAV1_Dsp10bpp_WeightMask_32x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x32
+#define LIBGAV1_Dsp10bpp_WeightMask_32x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_32x64
+#define LIBGAV1_Dsp10bpp_WeightMask_32x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x16
+#define LIBGAV1_Dsp10bpp_WeightMask_64x16 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x32
+#define LIBGAV1_Dsp10bpp_WeightMask_64x32 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x64
+#define LIBGAV1_Dsp10bpp_WeightMask_64x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_64x128
+#define LIBGAV1_Dsp10bpp_WeightMask_64x128 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x64
+#define LIBGAV1_Dsp10bpp_WeightMask_128x64 LIBGAV1_CPU_SSE4_1
+#endif
+
+#ifndef LIBGAV1_Dsp10bpp_WeightMask_128x128
+#define LIBGAV1_Dsp10bpp_WeightMask_128x128 LIBGAV1_CPU_SSE4_1
+#endif
+#endif // LIBGAV1_TARGETING_SSE4_1
#endif // LIBGAV1_SRC_DSP_X86_WEIGHT_MASK_SSE4_H_
diff --git a/libgav1/src/film_grain.cc b/libgav1/src/film_grain.cc
index 15ae956..dac37b5 100644
--- a/libgav1/src/film_grain.cc
+++ b/libgav1/src/film_grain.cc
@@ -433,7 +433,7 @@
if (!is_monochrome_) {
noise_buffer_size += 2 * max_luma_num *
(kNoiseStripeHeight >> subsampling_y_) *
- RightShiftWithRounding(width_, subsampling_x_);
+ SubsampledValue(width_, subsampling_x_);
}
noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
if (noise_buffer_ == nullptr) return false;
@@ -444,18 +444,16 @@
noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
}
if (!is_monochrome_) {
- noise_stripes_[kPlaneU].Reset(
- max_luma_num,
- (kNoiseStripeHeight >> subsampling_y_) *
- RightShiftWithRounding(width_, subsampling_x_),
- noise_buffer);
+ noise_stripes_[kPlaneU].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
- RightShiftWithRounding(width_, subsampling_x_);
- noise_stripes_[kPlaneV].Reset(
- max_luma_num,
- (kNoiseStripeHeight >> subsampling_y_) *
- RightShiftWithRounding(width_, subsampling_x_),
- noise_buffer);
+ SubsampledValue(width_, subsampling_x_);
+ noise_stripes_[kPlaneV].Reset(max_luma_num,
+ (kNoiseStripeHeight >> subsampling_y_) *
+ SubsampledValue(width_, subsampling_x_),
+ noise_buffer);
}
return true;
}
@@ -715,8 +713,8 @@
planes_to_blend[num_planes++] = kPlaneU;
planes_to_blend[num_planes++] = kPlaneV;
} else {
- const int height_uv = RightShiftWithRounding(height_, subsampling_y_);
- const int width_uv = RightShiftWithRounding(width_, subsampling_x_);
+ const int height_uv = SubsampledValue(height_, subsampling_y_);
+ const int width_uv = SubsampledValue(width_, subsampling_x_);
// Noise is applied according to a lookup table defined by piecewise
// linear "points." If the lookup table is empty, that corresponds to
diff --git a/libgav1/src/film_grain.h b/libgav1/src/film_grain.h
index 6757214..b588f6d 100644
--- a/libgav1/src/film_grain.h
+++ b/libgav1/src/film_grain.h
@@ -178,8 +178,8 @@
//
// noise_stripes_[kPlaneU][luma_num] or noise_stripes_[kPlaneV][luma_num]
// is an array that has (34 >> subsampling_y_) rows and
- // RightShiftWithRounding(width_, subsampling_x_) columns and contains noise
- // for the chroma components.
+ // SubsampledValue(width_, subsampling_x_) columns and contains noise for the
+ // chroma components.
Array2DView<GrainType> noise_stripes_[kMaxPlanes];
// Owns the memory that the elements of noise_stripes_ point to.
std::unique_ptr<GrainType[]> noise_buffer_;
diff --git a/libgav1/src/frame_scratch_buffer.h b/libgav1/src/frame_scratch_buffer.h
index 1d6a1f4..90c3bb8 100644
--- a/libgav1/src/frame_scratch_buffer.h
+++ b/libgav1/src/frame_scratch_buffer.h
@@ -54,20 +54,18 @@
TemporalMotionField motion_field;
SymbolDecoderContext symbol_decoder_context;
std::unique_ptr<ResidualBufferPool> residual_buffer_pool;
- // threaded_window_buffer will be subdivided by PostFilter into windows of
- // width 512 pixels. Each row in the window is filtered by a worker thread.
- // To avoid false sharing, each 512-pixel row processed by one thread should
- // not share a cache line with a row processed by another thread. So we align
- // threaded_window_buffer to the cache line size. In addition, it is faster to
- // memcpy from an aligned buffer.
- AlignedDynamicBuffer<uint8_t, kCacheLineSize> threaded_window_buffer;
+ // Buffer used to store the cdef borders. This buffer will store 4 rows for
+ // every 64x64 block (4 rows for every 32x32 for chroma with subsampling). The
+ // indices of the rows that are stored are specified in |kCdefBorderRows|.
+ YuvBuffer cdef_border;
+ AlignedDynamicBuffer<uint8_t, 16> superres_coefficients[kNumPlaneTypes];
// Buffer used to temporarily store the input row for applying SuperRes.
- AlignedDynamicBuffer<uint8_t, 16> superres_line_buffer;
- // Buffer used to store the deblocked pixels that are necessary for loop
- // restoration. This buffer will store 4 rows for every 64x64 block (4 rows
- // for every 32x32 for chroma with subsampling). The indices of the rows that
- // are stored are specified in |kDeblockedRowsForLoopRestoration|.
- YuvBuffer deblock_buffer;
+ YuvBuffer superres_line_buffer;
+ // Buffer used to store the loop restoration borders. This buffer will store 4
+ // rows for every 64x64 block (4 rows for every 32x32 for chroma with
+ // subsampling). The indices of the rows that are stored are specified in
+ // |kLoopRestorationBorderRows|.
+ YuvBuffer loop_restoration_border;
// The size of this dynamic buffer is |tile_rows|.
DynamicBuffer<IntraPredictionBuffer> intra_prediction_buffers;
TileScratchBufferPool tile_scratch_buffer_pool;
diff --git a/libgav1/src/gav1/decoder_settings.h b/libgav1/src/gav1/decoder_settings.h
index ab22a4d..7ee487f 100644
--- a/libgav1/src/gav1/decoder_settings.h
+++ b/libgav1/src/gav1/decoder_settings.h
@@ -62,7 +62,8 @@
Libgav1GetFrameBufferCallback get_frame_buffer;
// Release frame buffer callback.
Libgav1ReleaseFrameBufferCallback release_frame_buffer;
- // Release input frame buffer callback.
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
Libgav1ReleaseInputBufferCallback release_input_buffer;
// Passed as the private_data argument to the callbacks.
void* callback_private_data;
@@ -117,7 +118,8 @@
GetFrameBufferCallback get_frame_buffer = nullptr;
// Release frame buffer callback.
ReleaseFrameBufferCallback release_frame_buffer = nullptr;
- // Release input frame buffer callback.
+ // Release input frame buffer callback. This callback must be set when
+ // |frame_parallel| is true.
ReleaseInputBufferCallback release_input_buffer = nullptr;
// Passed as the private_data argument to the callbacks.
void* callback_private_data = nullptr;
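For reference, a minimal sketch of honoring the new requirement in the C++ API; the callback signature is assumed to mirror the C typedef, and the function names here are hypothetical.
#include "gav1/decoder_settings.h"  // assumed public include path

// Hypothetical application callback; signature assumed from
// Libgav1ReleaseInputBufferCallback.
void ReleaseInput(void* callback_private_data, void* buffer_private_data) {
  // Return the input buffer to the application's pool.
}

libgav1::DecoderSettings MakeFrameParallelSettings(void* app_context) {
  libgav1::DecoderSettings settings;
  settings.frame_parallel = true;
  // Required when frame_parallel is true, per the comment above.
  settings.release_input_buffer = ReleaseInput;
  settings.callback_private_data = app_context;
  return settings;
}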
diff --git a/libgav1/src/gav1/symbol_visibility.h b/libgav1/src/gav1/symbol_visibility.h
index ad7498c..116a514 100644
--- a/libgav1/src/gav1/symbol_visibility.h
+++ b/libgav1/src/gav1/symbol_visibility.h
@@ -58,6 +58,11 @@
//
// Much of the above information and more can be found at
// https://gcc.gnu.org/wiki/Visibility
+//
+// NOTE: A third-party build system for libgav1 can add -DLIBGAV1_PUBLIC= to the
+// compiler command line to override the definition of LIBGAV1_PUBLIC in this
+// header. This can be used to create a libgav1 static library that will not
+// export any symbols when it is linked into a shared library.
#if !defined(LIBGAV1_PUBLIC)
#if defined(_WIN32)
@@ -76,7 +81,7 @@
#else
#define LIBGAV1_PUBLIC
#endif // defined(LIBGAV1_BUILDING_DLL) && LIBGAV1_BUILDING_DLL
-#else
+#else // !defined(_WIN32)
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIBGAV1_PUBLIC __attribute__((visibility("default")))
#else
diff --git a/libgav1/src/gav1/version.h b/libgav1/src/gav1/version.h
index e78e9a7..c018928 100644
--- a/libgav1/src/gav1/version.h
+++ b/libgav1/src/gav1/version.h
@@ -24,7 +24,7 @@
#define LIBGAV1_MAJOR_VERSION 0
#define LIBGAV1_MINOR_VERSION 16
-#define LIBGAV1_PATCH_VERSION 0
+#define LIBGAV1_PATCH_VERSION 3
#define LIBGAV1_VERSION \
((LIBGAV1_MAJOR_VERSION << 16) | (LIBGAV1_MINOR_VERSION << 8) | \
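For reference, the packed value for the new version, using the macro above:
// v0.16.3: (0 << 16) | (16 << 8) | 3 == 0x1003 == 4099.
static_assert(((0 << 16) | (16 << 8) | 3) == 0x1003,
              "LIBGAV1_VERSION for v0.16.3");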
diff --git a/libgav1/src/loop_restoration_info.cc b/libgav1/src/loop_restoration_info.cc
index 3830836..2dba57d 100644
--- a/libgav1/src/loop_restoration_info.cc
+++ b/libgav1/src/loop_restoration_info.cc
@@ -70,18 +70,16 @@
continue;
}
plane_needs_filtering_[plane] = true;
- const int plane_width = (plane == kPlaneY)
- ? width
- : RightShiftWithRounding(width, subsampling_x_);
+ const int plane_width =
+ (plane == kPlaneY) ? width : SubsampledValue(width, subsampling_x_);
const int plane_height =
- (plane == kPlaneY) ? height
- : RightShiftWithRounding(height, subsampling_y_);
- num_horizontal_units_[plane] = std::max(
- 1, (plane_width + DivideBy2(loop_restoration_->unit_size[plane])) /
- loop_restoration_->unit_size[plane]);
+ (plane == kPlaneY) ? height : SubsampledValue(height, subsampling_y_);
+ num_horizontal_units_[plane] =
+ std::max(1, RightShiftWithRounding(
+ plane_width, loop_restoration_->unit_size_log2[plane]));
num_vertical_units_[plane] = std::max(
- 1, (plane_height + DivideBy2(loop_restoration_->unit_size[plane])) /
- loop_restoration_->unit_size[plane]);
+ 1, RightShiftWithRounding(plane_height,
+ loop_restoration_->unit_size_log2[plane]));
num_units_[plane] =
num_horizontal_units_[plane] * num_vertical_units_[plane];
total_num_units += num_units_[plane];
@@ -109,29 +107,25 @@
LoopRestorationUnitInfo* const unit_info) const {
assert(unit_info != nullptr);
if (!plane_needs_filtering_[plane]) return false;
- const int denominator_column =
- is_superres_scaled
- ? loop_restoration_->unit_size[plane] * kSuperResScaleNumerator
- : loop_restoration_->unit_size[plane];
const int numerator_column =
is_superres_scaled ? superres_scale_denominator : 1;
const int pixel_column_start =
RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_);
const int pixel_column_end = RowOrColumn4x4ToPixel(
column4x4 + kNum4x4BlocksWide[block_size], plane, subsampling_x_);
- const int unit_row = loop_restoration_->unit_size[plane];
+ const int unit_row_log2 = loop_restoration_->unit_size_log2[plane];
+ const int denominator_column_log2 =
+ unit_row_log2 + (is_superres_scaled ? 3 : 0);
const int pixel_row_start =
RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_);
const int pixel_row_end = RowOrColumn4x4ToPixel(
row4x4 + kNum4x4BlocksHigh[block_size], plane, subsampling_y_);
- unit_info->column_start =
- (pixel_column_start * numerator_column + denominator_column - 1) /
- denominator_column;
- unit_info->column_end =
- (pixel_column_end * numerator_column + denominator_column - 1) /
- denominator_column;
- unit_info->row_start = (pixel_row_start + unit_row - 1) / unit_row;
- unit_info->row_end = (pixel_row_end + unit_row - 1) / unit_row;
+ unit_info->column_start = RightShiftWithCeiling(
+ pixel_column_start * numerator_column, denominator_column_log2);
+ unit_info->column_end = RightShiftWithCeiling(
+ pixel_column_end * numerator_column, denominator_column_log2);
+ unit_info->row_start = RightShiftWithCeiling(pixel_row_start, unit_row_log2);
+ unit_info->row_end = RightShiftWithCeiling(pixel_row_end, unit_row_log2);
unit_info->column_end =
std::min(unit_info->column_end, num_horizontal_units_[plane]);
unit_info->row_end = std::min(unit_info->row_end, num_vertical_units_[plane]);
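For reference: the ceiling divisions removed above become exact shifts because unit_size is a power of two, and kSuperResScaleNumerator == 8 contributes the extra +3 to denominator_column_log2. A minimal model, assuming the common.h definition:
constexpr int RightShiftWithCeiling(int value, int bits) {
  return (value + (1 << bits) - 1) >> bits;
}
// With unit_size == 1 << unit_size_log2:
//   (x + unit_size - 1) / unit_size == RightShiftWithCeiling(x, unit_size_log2)
static_assert(RightShiftWithCeiling(129, 6) == (129 + 64 - 1) / 64, "");
static_assert(RightShiftWithCeiling(128, 6) == 2, "");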
diff --git a/libgav1/src/motion_vector.cc b/libgav1/src/motion_vector.cc
index 8223f3d..fdb1875 100644
--- a/libgav1/src/motion_vector.cc
+++ b/libgav1/src/motion_vector.cc
@@ -63,16 +63,12 @@
const ObuFrameHeader& frame_header = block.tile.frame_header();
ReferenceFrameType reference_type = bp.reference_frame[index];
const auto& gm = frame_header.global_motion[reference_type];
- GlobalMotionTransformationType global_motion_type =
- (reference_type != kReferenceFrameIntra)
- ? gm.type
- : kNumGlobalMotionTransformationTypes;
if (reference_type == kReferenceFrameIntra ||
- global_motion_type == kGlobalMotionTransformationTypeIdentity) {
+ gm.type == kGlobalMotionTransformationTypeIdentity) {
mv->mv32 = 0;
return;
}
- if (global_motion_type == kGlobalMotionTransformationTypeTranslation) {
+ if (gm.type == kGlobalMotionTransformationTypeTranslation) {
for (int i = 0; i < 2; ++i) {
mv->mv[i] = gm.params[i] >> (kWarpedModelPrecisionBits - 3);
}
@@ -127,18 +123,19 @@
*found_new_mv |= kPredictionModeNewMvMask.Contains(mv_bp.y_mode);
*found_match = true;
MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
- const auto result = std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+ const int num_found = *num_mv_found;
+ const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
[&candidate_mv](const MotionVector& ref_mv) {
return ref_mv == candidate_mv;
});
- if (result != ref_mv_stack + *num_mv_found) {
+ if (result != ref_mv_stack + num_found) {
prediction_parameters.IncreaseWeight(std::distance(ref_mv_stack, result),
weight);
return;
}
- if (*num_mv_found >= kMaxRefMvStackSize) return;
- ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, weight);
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
++*num_mv_found;
}
@@ -163,19 +160,20 @@
*found_match = true;
CompoundMotionVector* const compound_ref_mv_stack =
prediction_parameters.compound_ref_mv_stack;
+ const int num_found = *num_mv_found;
const auto result =
- std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + *num_mv_found,
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
[&candidate_mv](const CompoundMotionVector& ref_mv) {
return ref_mv == candidate_mv;
});
- if (result != compound_ref_mv_stack + *num_mv_found) {
+ if (result != compound_ref_mv_stack + num_found) {
prediction_parameters.IncreaseWeight(
std::distance(compound_ref_mv_stack, result), weight);
return;
}
- if (*num_mv_found >= kMaxRefMvStackSize) return;
- compound_ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, weight);
+ if (num_found >= kMaxRefMvStackSize) return;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, weight);
++*num_mv_found;
}
@@ -305,24 +303,26 @@
}
CompoundMotionVector* const compound_ref_mv_stack =
prediction_parameters->compound_ref_mv_stack;
+ int num_found = *num_mv_found;
int index = 0;
do {
const CompoundMotionVector& candidate_mv = candidate_mvs[index];
- const auto result = std::find_if(
- compound_ref_mv_stack, compound_ref_mv_stack + *num_mv_found,
- [&candidate_mv](const CompoundMotionVector& ref_mv) {
- return ref_mv == candidate_mv;
- });
- if (result != compound_ref_mv_stack + *num_mv_found) {
+ const auto result =
+ std::find_if(compound_ref_mv_stack, compound_ref_mv_stack + num_found,
+ [&candidate_mv](const CompoundMotionVector& ref_mv) {
+ return ref_mv == candidate_mv;
+ });
+ if (result != compound_ref_mv_stack + num_found) {
prediction_parameters->IncreaseWeight(
std::distance(compound_ref_mv_stack, result), 2);
continue;
}
- if (*num_mv_found >= kMaxRefMvStackSize) continue;
- compound_ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2);
- ++*num_mv_found;
+ if (num_found >= kMaxRefMvStackSize) continue;
+ compound_ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
} while (++index < count);
+ *num_mv_found = num_found;
return;
}
MotionVector* const ref_mv_stack = prediction_parameters->ref_mv_stack;
@@ -333,19 +333,20 @@
*zero_mv_context = static_cast<int>(max_difference >= 16);
}
const MotionVector candidate_mv = {};
+ const int num_found = *num_mv_found;
const auto result =
- std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
[&candidate_mv](const MotionVector& ref_mv) {
return ref_mv == candidate_mv;
});
- if (result != ref_mv_stack + *num_mv_found) {
+ if (result != ref_mv_stack + num_found) {
prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
2 * count);
return;
}
- if (*num_mv_found >= kMaxRefMvStackSize) return;
- ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2 * count);
+ if (num_found >= kMaxRefMvStackSize) return;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2 * count);
++*num_mv_found;
return;
}
@@ -361,24 +362,26 @@
std::abs(candidate_mvs[0].mv[1] - global_mv[0].mv[1]));
*zero_mv_context = static_cast<int>(max_difference >= 16);
}
+ int num_found = *num_mv_found;
int index = 0;
do {
const MotionVector& candidate_mv = candidate_mvs[index];
const auto result =
- std::find_if(ref_mv_stack, ref_mv_stack + *num_mv_found,
+ std::find_if(ref_mv_stack, ref_mv_stack + num_found,
[&candidate_mv](const MotionVector& ref_mv) {
return ref_mv == candidate_mv;
});
- if (result != ref_mv_stack + *num_mv_found) {
+ if (result != ref_mv_stack + num_found) {
prediction_parameters->IncreaseWeight(std::distance(ref_mv_stack, result),
2);
continue;
}
- if (*num_mv_found >= kMaxRefMvStackSize) continue;
- ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters->SetWeightIndexStackEntry(*num_mv_found, 2);
- ++*num_mv_found;
+ if (num_found >= kMaxRefMvStackSize) continue;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters->SetWeightIndexStackEntry(num_found, 2);
+ ++num_found;
} while (++index < count);
+ *num_mv_found = num_found;
}
// Part of 7.10.2.5.
@@ -397,9 +400,6 @@
kBlock32x8, kBlock32x16, kBlock32x32);
// 7.10.2.5.
-//
-// The |zero_mv_context| output parameter may be null. If |zero_mv_context| is
-// not null, the function may set |*zero_mv_context|.
void TemporalScan(const Tile::Block& block, bool is_compound,
int* const zero_mv_context, int* const num_mv_found) {
const int step_w = (block.width4x4 >= 16) ? 4 : 2;
@@ -552,6 +552,7 @@
PredictionParameters& prediction_parameters =
*block.bp->prediction_parameters;
MotionVector* const ref_mv_stack = prediction_parameters.ref_mv_stack;
+ int num_found = *num_mv_found;
for (int i = 0; i < 2; ++i) {
const ReferenceFrameType candidate_reference_frame = bp.reference_frame[i];
if (candidate_reference_frame <= kReferenceFrameIntra) continue;
@@ -561,15 +562,16 @@
candidate_mv.mv[0] *= -1;
candidate_mv.mv[1] *= -1;
}
- assert(*num_mv_found <= 2);
- if ((*num_mv_found != 0 && ref_mv_stack[0] == candidate_mv) ||
- (*num_mv_found == 2 && ref_mv_stack[1] == candidate_mv)) {
+ assert(num_found <= 2);
+ if ((num_found != 0 && ref_mv_stack[0] == candidate_mv) ||
+ (num_found == 2 && ref_mv_stack[1] == candidate_mv)) {
continue;
}
- ref_mv_stack[*num_mv_found] = candidate_mv;
- prediction_parameters.SetWeightIndexStackEntry(*num_mv_found, 0);
- ++*num_mv_found;
+ ref_mv_stack[num_found] = candidate_mv;
+ prediction_parameters.SetWeightIndexStackEntry(num_found, 0);
+ ++num_found;
}
+ *num_mv_found = num_found;
}
// 7.10.2.12.
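The recurring change in this file caches *num_mv_found in a local before the search loop, so the through-pointer value is read once rather than on every use. A hedged sketch of the find-or-bump-weight pattern (the types and the weights array below are stand-ins for libgav1's PredictionParameters machinery, not its exact API):

#include <algorithm>
#include <cstdint>

struct MotionVector {
  int16_t mv[2];
  bool operator==(const MotionVector& other) const {
    return mv[0] == other.mv[0] && mv[1] == other.mv[1];
  }
};

constexpr int kMaxRefMvStackSize = 8;

void AddToRefMvStack(const MotionVector& candidate, MotionVector* ref_mv_stack,
                     int* weights, int* num_mv_found, int weight) {
  const int num_found = *num_mv_found;  // Read the pointee once.
  const auto result = std::find_if(ref_mv_stack, ref_mv_stack + num_found,
                                   [&candidate](const MotionVector& ref_mv) {
                                     return ref_mv == candidate;
                                   });
  if (result != ref_mv_stack + num_found) {
    // Duplicate candidate: bump its weight (stand-in for IncreaseWeight()).
    weights[result - ref_mv_stack] += weight;
    return;
  }
  if (num_found >= kMaxRefMvStackSize) return;
  ref_mv_stack[num_found] = candidate;
  weights[num_found] = weight;
  ++*num_mv_found;
}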
diff --git a/libgav1/src/obu_parser.cc b/libgav1/src/obu_parser.cc
index 41df909..69480d7 100644
--- a/libgav1/src/obu_parser.cc
+++ b/libgav1/src/obu_parser.cc
@@ -188,6 +188,16 @@
color_config->color_range = kColorRangeFull;
color_config->subsampling_x = 0;
color_config->subsampling_y = 0;
+ // YUV 4:4:4 is only allowed in profile 1, or profile 2 with bit depth 12.
+ // See the table at the beginning of Section 6.4.1.
+ if (sequence_header->profile != kProfile1 &&
+ (sequence_header->profile != kProfile2 ||
+ color_config->bitdepth != 12)) {
+ LIBGAV1_DLOG(ERROR,
+ "YUV 4:4:4 is not allowed in profile %d for bitdepth %d.",
+ sequence_header->profile, color_config->bitdepth);
+ return false;
+ }
} else {
OBU_READ_BIT_OR_FAIL;
color_config->color_range = static_cast<ColorRange>(scratch);
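The new conformance check above follows the format table at the start of Section 6.4.1: the identity-matrix path implies YUV 4:4:4, which only profile 1 supports directly, plus profile 2 at bit depth 12. A one-function sketch of the constraint (illustrative helper, not the parser's API):

// Returns true if YUV 4:4:4 (subsampling_x == subsampling_y == 0) is legal
// for the given profile/bitdepth combination per Section 6.4.1.
bool IsYuv444Allowed(int profile, int bitdepth) {
  return profile == 1 || (profile == 2 && bitdepth == 12);
}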
@@ -469,9 +479,13 @@
LIBGAV1_DLOG(ERROR, "Sequence header changed in the middle of a frame.");
return false;
}
+ sequence_header_changed_ = true;
decoder_state_.ClearReferenceFrames();
}
sequence_header_ = sequence_header;
+ if (!has_sequence_header_) {
+ sequence_header_changed_ = true;
+ }
has_sequence_header_ = true;
// Section 6.4.1: It is a requirement of bitstream conformance that if
// OperatingPointIdc is equal to 0, then obu_extension_flag is equal to 0 for
@@ -499,12 +513,12 @@
if (lower_bound_is_smaller) {
if (reference_frame_id > decoder_state_.current_frame_id ||
reference_frame_id < lower_bound) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
} else {
if (reference_frame_id > decoder_state_.current_frame_id &&
reference_frame_id < lower_bound) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
}
}
@@ -611,7 +625,7 @@
frame_header_.reference_order_hint[i] = scratch;
if (frame_header_.reference_order_hint[i] !=
decoder_state_.reference_order_hint[i]) {
- decoder_state_.reference_valid[i] = false;
+ decoder_state_.reference_frame[i] = nullptr;
}
}
return true;
@@ -1149,8 +1163,7 @@
unit_shift += unit_extra_shift;
}
}
- loop_restoration->unit_size[kPlaneY] =
- kLoopRestorationTileSizeMax >> (2 - unit_shift);
+ loop_restoration->unit_size_log2[kPlaneY] = 6 + unit_shift;
uint8_t uv_shift = 0;
if (sequence_header_.color_config.subsampling_x != 0 &&
sequence_header_.color_config.subsampling_y != 0 &&
@@ -1158,9 +1171,9 @@
OBU_READ_BIT_OR_FAIL;
uv_shift = scratch;
}
- loop_restoration->unit_size[kPlaneU] =
- loop_restoration->unit_size[kPlaneV] =
- loop_restoration->unit_size[0] >> uv_shift;
+ loop_restoration->unit_size_log2[kPlaneU] =
+ loop_restoration->unit_size_log2[kPlaneV] =
+ loop_restoration->unit_size_log2[0] - uv_shift;
}
return true;
}
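Since kLoopRestorationTileSizeMax is 256, the old expression 256 >> (2 - unit_shift) equals 1 << (6 + unit_shift), so storing the log2 directly is an exact rewrite of the old unit-size computation. A quick standalone check of that equivalence:

#include <cassert>

int main() {
  constexpr int kLoopRestorationTileSizeMax = 256;
  for (int unit_shift = 0; unit_shift <= 2; ++unit_shift) {
    const int old_unit_size = kLoopRestorationTileSizeMax >> (2 - unit_shift);
    const int new_unit_size_log2 = 6 + unit_shift;
    assert(old_unit_size == 1 << new_unit_size_log2);  // 64, 128, 256.
  }
  return 0;
}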
@@ -1778,10 +1791,11 @@
// whenever display_frame_id is read, the value matches
// RefFrameId[ frame_to_show_map_idx ] ..., and that
// RefValid[ frame_to_show_map_idx ] is equal to 1.
+ //
+ // The current_frame_ == nullptr check below is equivalent to checking
+ // if RefValid[ frame_to_show_map_idx ] is equal to 1.
if (frame_header_.display_frame_id !=
- decoder_state_
- .reference_frame_id[frame_header_.frame_to_show] ||
- !decoder_state_.reference_valid[frame_header_.frame_to_show]) {
+ decoder_state_.reference_frame_id[frame_header_.frame_to_show]) {
LIBGAV1_DLOG(ERROR,
"Reference buffer %d has a frame id number mismatch.",
frame_header_.frame_to_show);
@@ -1859,8 +1873,8 @@
}
}
if (frame_header_.frame_type == kFrameKey && frame_header_.show_frame) {
- decoder_state_.reference_valid.fill(false);
decoder_state_.reference_order_hint.fill(0);
+ decoder_state_.reference_frame.fill(nullptr);
}
OBU_READ_BIT_OR_FAIL;
frame_header_.enable_cdf_update = !static_cast<bool>(scratch);
@@ -1890,27 +1904,28 @@
frame_header_.current_frame_id = static_cast<uint16_t>(scratch);
const int previous_frame_id = decoder_state_.current_frame_id;
decoder_state_.current_frame_id = frame_header_.current_frame_id;
- if ((frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) &&
- previous_frame_id >= 0) {
- // Section 6.8.2: ..., it is a requirement of bitstream conformance
- // that all of the following conditions are true:
- // * current_frame_id is not equal to PrevFrameID,
- // * DiffFrameID is less than 1 << ( idLen - 1 )
- int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
- const int id_length_max_value = 1
- << sequence_header_.frame_id_length_bits;
- if (diff_frame_id <= 0) {
- diff_frame_id += id_length_max_value;
+ if (frame_header_.frame_type != kFrameKey || !frame_header_.show_frame) {
+ if (previous_frame_id >= 0) {
+ // Section 6.8.2: ..., it is a requirement of bitstream conformance
+ // that all of the following conditions are true:
+ // * current_frame_id is not equal to PrevFrameID,
+ // * DiffFrameID is less than 1 << ( idLen - 1 )
+ int diff_frame_id = decoder_state_.current_frame_id - previous_frame_id;
+ const int id_length_max_value =
+ 1 << sequence_header_.frame_id_length_bits;
+ if (diff_frame_id <= 0) {
+ diff_frame_id += id_length_max_value;
+ }
+ if (diff_frame_id >= DivideBy2(id_length_max_value)) {
+ LIBGAV1_DLOG(ERROR,
+ "current_frame_id (%d) equals or differs too much from "
+ "previous_frame_id (%d).",
+ decoder_state_.current_frame_id, previous_frame_id);
+ return false;
+ }
}
- if (diff_frame_id >= DivideBy2(id_length_max_value)) {
- LIBGAV1_DLOG(ERROR,
- "current_frame_id (%d) equals or differs too much from "
- "previous_frame_id (%d).",
- decoder_state_.current_frame_id, previous_frame_id);
- return false;
- }
+ MarkInvalidReferenceFrames();
}
- MarkInvalidReferenceFrames();
} else {
frame_header_.current_frame_id = 0;
decoder_state_.current_frame_id = frame_header_.current_frame_id;
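The re-scoped block above keeps the Section 6.8.2 wraparound arithmetic intact: DiffFrameID is brought into the range (0, 2^idLen] and must stay below 2^(idLen - 1). A worked example with assumed values:

#include <cassert>

int main() {
  const int frame_id_length_bits = 8;  // Assumed idLen for illustration.
  const int id_length_max_value = 1 << frame_id_length_bits;  // 256.
  const int previous_frame_id = 250;
  const int current_frame_id = 3;  // Wrapped past 255.
  int diff_frame_id = current_frame_id - previous_frame_id;  // -247.
  if (diff_frame_id <= 0) diff_frame_id += id_length_max_value;  // 9.
  // 9 < 128, so this frame id progression is conformant.
  assert(diff_frame_id < id_length_max_value / 2);
  return 0;
}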
@@ -2008,15 +2023,8 @@
// Note if support for Annex C: Error resilience behavior is added this
// check should be omitted per C.5 Decoder consequences of processable
// frames.
- if (!decoder_state_.reference_valid[reference_frame_index]) {
- LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
- reference_frame_index);
- return false;
- }
- // Check if the inter frame requests a nonexistent reference, whether or
- // not frame_refs_short_signaling is used.
if (decoder_state_.reference_frame[reference_frame_index] == nullptr) {
- LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not a decoded frame.", i,
+ LIBGAV1_DLOG(ERROR, "ref_frame_idx[%d] (%d) is not valid.", i,
reference_frame_index);
return false;
}
@@ -2032,12 +2040,8 @@
// Section 6.8.2: It is a requirement of bitstream conformance that
// whenever expectedFrameId[ i ] is calculated, the value matches
// RefFrameId[ ref_frame_idx[ i ] ] ...
- //
- // Section 6.8.2: It is a requirement of bitstream conformance that
- // RefValid[ ref_frame_idx[ i ] ] is equal to 1, ...
if (frame_header_.expected_frame_id[i] !=
- decoder_state_.reference_frame_id[reference_frame_index] ||
- !decoder_state_.reference_valid[reference_frame_index]) {
+ decoder_state_.reference_frame_id[reference_frame_index]) {
LIBGAV1_DLOG(ERROR,
"Reference buffer %d has a frame id number mismatch.",
reference_frame_index);
@@ -2045,20 +2049,6 @@
}
}
}
- // Validate frame_header_.primary_reference_frame.
- if (frame_header_.primary_reference_frame != kPrimaryReferenceNone) {
- const int index =
- frame_header_
- .reference_frame_index[frame_header_.primary_reference_frame];
- if (decoder_state_.reference_frame[index] == nullptr) {
- LIBGAV1_DLOG(ERROR,
- "primary_ref_frame is %d but ref_frame_idx[%d] (%d) is "
- "not a decoded frame.",
- frame_header_.primary_reference_frame,
- frame_header_.primary_reference_frame, index);
- return false;
- }
- }
if (frame_header_.frame_size_override_flag &&
!frame_header_.error_resilient_mode) {
// Section 5.9.7.
@@ -2668,6 +2658,7 @@
metadata_ = {};
tile_buffers_.clear();
next_tile_group_start_ = 0;
+ sequence_header_changed_ = false;
bool parsed_one_full_frame = false;
bool seen_frame_header = false;
diff --git a/libgav1/src/obu_parser.h b/libgav1/src/obu_parser.h
index 22a2396..c4619ed 100644
--- a/libgav1/src/obu_parser.h
+++ b/libgav1/src/obu_parser.h
@@ -276,6 +276,9 @@
const ObuFrameHeader& frame_header() const { return frame_header_; }
const Vector<TileBuffer>& tile_buffers() const { return tile_buffers_; }
const ObuMetadata& metadata() const { return metadata_; }
+ // Returns true if the last call to ParseOneFrame() encountered a sequence
+ // header change.
+ bool sequence_header_changed() const { return sequence_header_changed_; }
// Setters.
void set_sequence_header(const ObuSequenceHeader& sequence_header) {
@@ -284,7 +287,7 @@
}
// Moves |tile_buffers_| into |tile_buffers|.
- void MoveTileBuffer(Vector<TileBuffer>* tile_buffers) {
+ void MoveTileBuffers(Vector<TileBuffer>* tile_buffers) {
*tile_buffers = std::move(tile_buffers_);
}
@@ -362,7 +365,8 @@
// ParseMetadata() can find the trailing bit of the OBU and either extract
// or skip over the payload data as an opaque chunk of data.
bool ParseMetadata(const uint8_t* data, size_t size); // 5.8.
- // Adds and populates the TileBuffer for each tile in the tile group.
+ // Adds and populates the TileBuffer for each tile in the tile group and
+ // updates |next_tile_group_start_|.
bool AddTileBuffers(int start, int end, size_t total_size,
size_t tg_header_size, size_t bytes_consumed_so_far);
bool ParseTileGroup(size_t size, size_t bytes_consumed_so_far); // 5.11.1.
@@ -383,6 +387,9 @@
int next_tile_group_start_ = 0;
// If true, the sequence_header_ field is valid.
bool has_sequence_header_ = false;
+ // If true, the last call to ParseOneFrame() encountered a sequence header
+ // change.
+ bool sequence_header_changed_ = false;
// If true, the obu_extension_flag syntax element in the OBU header must be
// 0. Set to true when parsing a sequence header if OperatingPointIdc is 0.
bool extension_disallowed_ = false;
diff --git a/libgav1/src/post_filter.h b/libgav1/src/post_filter.h
index d300049..dfcd08e 100644
--- a/libgav1/src/post_filter.h
+++ b/libgav1/src/post_filter.h
@@ -58,7 +58,7 @@
// The overall flow of data in this class (for both single and multi-threaded
// cases) is as follows:
// -> Input: |frame_buffer_|.
- // -> Initialize |source_buffer_|, |cdef_buffer_| and
+ // -> Initialize |source_buffer_|, |cdef_buffer_|, |superres_buffer_| and
// |loop_restoration_buffer_|.
// -> Deblocking:
// * Input: |source_buffer_|
@@ -68,9 +68,9 @@
// * Output: |cdef_buffer_|
// -> SuperRes:
// * Input: |cdef_buffer_|
- // * Output: |cdef_buffer_|
+ // * Output: |superres_buffer_|
// -> Loop Restoration:
- // * Input: |cdef_buffer_|
+ // * Input: |superres_buffer_|
// * Output: |loop_restoration_buffer_|.
// -> Now |frame_buffer_| contains the filtered frame.
PostFilter(const ObuFrameHeader& frame_header,
@@ -102,18 +102,20 @@
// Filter behavior (multi-threaded):
// * Deblock: In-place filtering. The output is written to |source_buffer_|.
// If cdef and loop restoration are both on, then 4 rows (as
- // specified by |kDeblockedRowsForLoopRestoration|) in every 64x64
- // block is copied into |deblock_buffer_|.
- // * Cdef: Filtering output is written into |threaded_window_buffer_| and then
- // copied into the |cdef_buffer_| (which is just |source_buffer_| with
- // a shift to the top-left).
- // * SuperRes: Near in-place filtering (with an additional line buffer for
- // each row). The output is written to |cdef_buffer_|.
- // * Restoration: Uses the |cdef_buffer_| and |deblock_buffer_| as the input
- // and the output is written into the
- // |threaded_window_buffer_|. It is then copied to the
- // |loop_restoration_buffer_| (which is just |cdef_buffer_|
- // with a shift to the top-left).
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // is copied into |loop_restoration_border_|.
+ // * Cdef: In-place filtering. Uses the |source_buffer_| and |cdef_border_| as
+ // the input and the output is written into |cdef_buffer_| (which is
+ // the same as |source_buffer_|).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| and
+ // |superres_line_buffer_| as the input and the output is written
+ // into |superres_buffer_| (which is just |cdef_buffer_| with a
+ // shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left).
void ApplyFilteringThreaded();
// Does the overall post processing filter for one superblock row starting at
@@ -123,17 +125,18 @@
// Filter behavior (single-threaded):
// * Deblock: In-place filtering. The output is written to |source_buffer_|.
// If cdef and loop restoration are both on, then 4 rows (as
- // specified by |kDeblockedRowsForLoopRestoration|) in every 64x64
- // block is copied into |deblock_buffer_|.
+ // specified by |kLoopRestorationBorderRows|) in every 64x64 block
+ // is copied into |loop_restoration_border_|.
// * Cdef: In-place filtering. The output is written into |cdef_buffer_|
// (which is just |source_buffer_| with a shift to the top-left).
- // * SuperRes: Near in-place filtering (with an additional line buffer for
- // each row). The output is written to |cdef_buffer_|.
- // * Restoration: Near in-place filtering. Uses a local block of size 64x64.
- // Uses the |cdef_buffer_| and |deblock_buffer_| as the input
- // and the output is written into |loop_restoration_buffer_|
- // (which is just |source_buffer_| with a shift to the
- // top-left).
+ // * SuperRes: Near in-place filtering. Uses the |cdef_buffer_| as the input
+ // and the output is written into |superres_buffer_| (which is
+ // just |cdef_buffer_| with a shift to the top).
+ // * Restoration: Near in-place filtering.
+ // Uses the |superres_buffer_| and |loop_restoration_border_|
+ // as the input and the output is written into
+ // |loop_restoration_buffer_| (which is just |superres_buffer_|
+ // with a shift to the left or top-left).
// Returns the index of the last row whose post processing is complete and can
// be used for referencing.
int ApplyFilteringForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row,
@@ -170,25 +173,6 @@
return DoDeblock(frame_header_, do_post_filter_mask_);
}
- // This function takes the cdef filtered buffer and the deblocked buffer to
- // prepare a block as input for loop restoration.
- // In striped loop restoration:
- // The filtering needs to fetch the area of size (width + 6) x (height + 4),
- // in which (width + 6) x height area is from upscaled frame
- // (superres_buffer). Top 2 rows and bottom 2 rows are from deblocked frame
- // (deblock_buffer). Special cases are: (1). when it is the top border, the
- // top 2 rows are from cdef filtered frame. (2). when it is the bottom border,
- // the bottom 2 rows are from cdef filtered frame. This function is called
- // only when cdef is applied for this frame.
- template <typename Pixel>
- static void PrepareLoopRestorationBlock(const Pixel* src_buffer,
- ptrdiff_t src_stride,
- const Pixel* deblock_buffer,
- ptrdiff_t deblock_stride, Pixel* dst,
- ptrdiff_t dst_stride, int width,
- int height, bool frame_top_border,
- bool frame_bottom_border);
-
uint8_t GetZeroDeltaDeblockFilterLevel(int segment_id, int level_index,
ReferenceFrameType type,
int mode_id) const {
@@ -235,36 +219,21 @@
}
LoopRestorationInfo* restoration_info() const { return restoration_info_; }
uint8_t* GetBufferOffset(uint8_t* base_buffer, int stride, Plane plane,
- int row4x4, int column4x4) const {
- return base_buffer +
- RowOrColumn4x4ToPixel(row4x4, plane, subsampling_y_[plane]) *
- stride +
- RowOrColumn4x4ToPixel(column4x4, plane, subsampling_x_[plane]) *
- pixel_size_;
+ int row, int column) const {
+ return base_buffer + (row >> subsampling_y_[plane]) * stride +
+ ((column >> subsampling_x_[plane]) << pixel_size_log2_);
}
uint8_t* GetSourceBuffer(Plane plane, int row4x4, int column4x4) const {
return GetBufferOffset(source_buffer_[plane], frame_buffer_.stride(plane),
- plane, row4x4, column4x4);
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
}
-
- static int GetWindowBufferWidth(const ThreadPool* const thread_pool,
- const ObuFrameHeader& frame_header) {
- return (thread_pool == nullptr) ? 0
- : Align(frame_header.upscaled_width, 64);
+ uint8_t* GetCdefBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
}
-
- // For multi-threaded cdef and loop restoration, window height is the minimum
- // of the following two quantities:
- // 1) thread_count * 64
- // 2) frame_height rounded up to the nearest power of 64
- // Where 64 is the block size for cdef and loop restoration.
- static int GetWindowBufferHeight(const ThreadPool* const thread_pool,
- const ObuFrameHeader& frame_header) {
- if (thread_pool == nullptr) return 0;
- const int thread_count = 1 + thread_pool->num_threads();
- const int window_height = MultiplyBy64(thread_count);
- const int adjusted_frame_height = Align(frame_header.height, 64);
- return std::min(adjusted_frame_height, window_height);
+ uint8_t* GetSuperResBuffer(Plane plane, int row4x4, int column4x4) const {
+ return GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ plane, MultiplyBy4(row4x4), MultiplyBy4(column4x4));
}
template <typename Pixel>
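The rewritten GetBufferOffset() takes pixel coordinates and replaces the multiply by pixel size with a shift by its log2; the 4x4-to-pixel conversion moves to the callers via MultiplyBy4(). A minimal sketch of the arithmetic (free function with illustrative values, not the class member itself):

#include <cstdint>

uint8_t* GetBufferOffsetSketch(uint8_t* base_buffer, int stride,
                               int subsampling_x, int subsampling_y,
                               int pixel_size_log2, int row, int column) {
  return base_buffer + (row >> subsampling_y) * stride +
         ((column >> subsampling_x) << pixel_size_log2);
}

// For a 10-bit 4:2:0 chroma plane (2 bytes per pixel, so pixel_size_log2 is
// 1), pixel (row 64, column 128) maps to byte offset 32 * stride + 64 * 2.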
@@ -302,8 +271,13 @@
// updated.
void CopyBordersForOneSuperBlockRow(int row4x4, int sb4x4,
bool for_loop_restoration);
- // Sets up the |deblock_buffer_| for loop restoration.
- void SetupDeblockBuffer(int row4x4_start, int sb4x4);
+ // Sets up the |loop_restoration_border_| for loop restoration.
+ // This is called when there is no CDEF filter. We copy rows from
+ // |superres_buffer_| and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start);
+ // This is called when there is a CDEF filter. We copy rows from
+ // |source_buffer_|, apply superres and do the line extension.
+ void SetupLoopRestorationBorder(int row4x4_start, int sb4x4);
// Returns true if we can perform border extension in loop (i.e.) without
// waiting until the entire frame is decoded. If intra_block_copy is true, we
// do in-loop border extension only if the upscaled_width is the same as 4 *
@@ -317,13 +291,21 @@
template <typename Pixel>
void CopyPlane(const Pixel* src, ptrdiff_t src_stride, int width, int height,
Pixel* dst, ptrdiff_t dst_stride) {
- for (int y = 0; y < height; ++y) {
+ assert(height > 0);
+ do {
memcpy(dst, src, width * sizeof(Pixel));
src += src_stride;
dst += dst_stride;
- }
+ } while (--height != 0);
}
+ // Worker function type used by the multi-threaded implementations of
+ // Deblocking, CDEF and Loop Restoration.
+ using WorkerFunction = void (PostFilter::*)(std::atomic<int>* row4x4_atomic);
+ // Schedules |worker| jobs on the |thread_pool_|, also runs |worker| in the
+ // calling thread, and returns once all the jobs are completed.
+ void RunJobs(WorkerFunction worker);
+
// Functions for the Deblocking filter.
static int GetIndex(int row4x4) { return DivideBy4(row4x4); }
@@ -361,16 +343,25 @@
// Applies deblock filtering for the superblock row starting at |row4x4| with
// a height of 4*|sb4x4|.
void ApplyDeblockFilterForOneSuperBlockRow(int row4x4, int sb4x4);
- void DeblockFilterWorker(int jobs_per_plane, const Plane* planes,
- int num_planes, std::atomic<int>* job_counter,
- DeblockFilter deblock_filter);
- void ApplyDeblockFilterThreaded();
+ // Worker function used for multi-threaded deblocking.
+ template <LoopFilterType loop_filter_type>
+ void DeblockFilterWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>),
+ WorkerFunction>::value,
+ "");
+ static_assert(
+ std::is_same<
+ decltype(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>),
+ WorkerFunction>::value,
+ "");
// Functions for the cdef filter.
- uint8_t* GetCdefBufferAndStride(int start_x, int start_y, int plane,
- int window_buffer_plane_size,
- int* cdef_stride) const;
+ // Copies the deblocked pixels necessary for use by the multi-threaded cdef
+ // implementation into |cdef_border_|.
+ void SetupCdefBorder(int row4x4);
// This function prepares the input source block for cdef filtering. The input
// source block contains a 12x12 block, with the inner 8x8 as the desired
// filter region. It pads the block if the 12x12 block includes out of frame
@@ -379,35 +370,43 @@
template <typename Pixel>
void PrepareCdefBlock(int block_width4x4, int block_height4x4, int row4x4,
int column4x4, uint16_t* cdef_source,
- ptrdiff_t cdef_stride, bool y_plane);
+ ptrdiff_t cdef_stride, bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns);
+ // Applies cdef for one 64x64 block.
template <typename Pixel>
void ApplyCdefForOneUnit(uint16_t* cdef_block, int index, int block_width4x4,
int block_height4x4, int row4x4_start,
- int column4x4_start);
+ int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]);
// Helper function used by ApplyCdefForOneSuperBlockRow to avoid some code
// duplication.
- void ApplyCdefForOneSuperBlockRowHelper(int row4x4, int block_height4x4);
- // Applies cdef filtering for the superblock row starting at |row4x4| with a
+ void ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4);
+ // Applies CDEF filtering for the superblock row starting at |row4x4| with a
// height of 4*|sb4x4|.
void ApplyCdefForOneSuperBlockRow(int row4x4, int sb4x4, bool is_last_row);
- template <typename Pixel>
- void ApplyCdefForOneRowInWindow(int row, int column);
- template <typename Pixel>
- void ApplyCdefThreaded();
- void ApplyCdef(); // Sections 7.15 and 7.15.1.
+ // Worker function used for multi-threaded CDEF.
+ void ApplyCdefWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyCdefWorker),
+ WorkerFunction>::value,
+ "");
// Functions for the SuperRes filter.
- // Applies super resolution for the |buffers| for |rows[plane]| rows of each
- // plane. If |in_place| is true, the line buffer will not be used and the
- // SuperRes output will be written to a row above the input row. If |in_place|
- // is false, the line buffer will be used to store a copy of the input and the
- // output will be written to the same row as the input row.
- template <bool in_place>
- void ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers,
- const std::array<int, kMaxPlanes>& strides,
- const std::array<int, kMaxPlanes>& rows,
- size_t line_buffer_offset); // Section 7.16.
+ // Applies SuperRes to |src|, processing |rows[plane]| rows of each plane. If
+ // |line_buffer_row| is greater than or equal to 0, one more row will be
+ // processed using the line buffer indicated by |line_buffer_row| as the
+ // source. If |dst_is_loop_restoration_border| is true, the |dst| pointers
+ // come from |loop_restoration_border_| and the strides will be populated
+ // from that buffer.
+ void ApplySuperRes(
+ const std::array<uint8_t*, kMaxPlanes>& src,
+ const std::array<int, kMaxPlanes>& rows, int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border = false); // Section 7.16.
// Applies SuperRes for the superblock row starting at |row4x4| with a height
// of 4*|sb4x4|.
void ApplySuperResForOneSuperBlockRow(int row4x4, int sb4x4,
@@ -416,22 +415,7 @@
// Functions for the Loop Restoration filter.
- template <typename Pixel>
- void ApplyLoopRestorationForOneRowInWindow(
- const Pixel* src_buffer, Plane plane, int plane_height, int plane_width,
- int y, int x, int row, int unit_row, int current_process_unit_height,
- int plane_unit_size, int window_width,
- Array2DView<Pixel>* loop_restored_window);
- // Applies loop restoration for the superblock row starting at |row4x4_start|
- // with a height of 4*|sb4x4|.
- template <typename Pixel>
- void ApplyLoopRestorationSingleThread(int row4x4_start, int sb4x4);
- void ApplyLoopRestoration(int row4x4_start, int sb4x4);
- template <typename Pixel>
- void ApplyLoopRestorationThreaded();
- // Note for ApplyLoopRestoration():
- // First, we must differentiate loop restoration processing unit from loop
- // restoration unit.
+ // Notes about Loop Restoration:
// (1). Loop restoration processing unit size is default to 64x64.
// Only when the remaining filtering area is smaller than 64x64, the
// processing unit size is the actual area size.
@@ -460,7 +444,26 @@
// then sizes of the first row of processing units are 64x56, 64x56, 12x56,
// respectively. The second row is 64x64, 64x64, 12x64.
// The third row is 64x20, 64x20, 12x20.
- void ApplyLoopRestoration();
+
+ // |stride| is shared by |src_buffer| and |dst_buffer|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneRow(const Pixel* src_buffer, ptrdiff_t stride,
+ Plane plane, int plane_height,
+ int plane_width, int y, int unit_row,
+ int current_process_unit_height,
+ int plane_unit_size, Pixel* dst_buffer);
+ // Applies loop restoration for the superblock row starting at |row4x4_start|
+ // with a height of 4*|sb4x4|.
+ template <typename Pixel>
+ void ApplyLoopRestorationForOneSuperBlockRow(int row4x4_start, int sb4x4);
+ // Helper function that calls the right variant of
+ // ApplyLoopRestorationForOneSuperBlockRow based on the bitdepth.
+ void ApplyLoopRestoration(int row4x4_start, int sb4x4);
+ // Worker function used for multi-threaded Loop Restoration.
+ void ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic);
+ static_assert(std::is_same<decltype(&PostFilter::ApplyLoopRestorationWorker),
+ WorkerFunction>::value,
+ "");
const ObuFrameHeader& frame_header_;
const LoopRestoration& loop_restoration_;
@@ -473,7 +476,7 @@
const int8_t subsampling_x_[kMaxPlanes];
const int8_t subsampling_y_[kMaxPlanes];
const int8_t planes_;
- const int pixel_size_;
+ const int pixel_size_log2_;
const uint8_t* const inner_thresh_;
const uint8_t* const outer_thresh_;
const bool needs_chroma_deblock_;
@@ -491,18 +494,13 @@
} super_res_info_[kMaxPlanes];
const Array2D<int16_t>& cdef_index_;
const Array2D<TransformSize>& inter_transform_sizes_;
- // Pointer to the data buffer used for multi-threaded cdef or loop
- // restoration. The size of this buffer must be at least
- // |window_buffer_width_| * |window_buffer_height_| * |pixel_size_|.
- // Or |planes_| times that for multi-threaded cdef.
- // If |thread_pool_| is nullptr, then this buffer is not used and can be
- // nullptr as well.
- uint8_t* const threaded_window_buffer_;
LoopRestorationInfo* const restoration_info_;
- // Pointer to the line buffer used by ApplySuperRes(). If SuperRes is on, then
- // the buffer will be large enough to hold one downscaled row +
- // 2 * kSuperResHorizontalBorder + kSuperResHorizontalPadding.
- uint8_t* const superres_line_buffer_;
+ uint8_t* const superres_coefficients_[kNumPlaneTypes];
+ // Line buffer used by multi-threaded ApplySuperRes().
+ // In the multi-threaded case, this buffer stores each thread's last
+ // downscaled input row so that it is not overwritten by the first upscaled
+ // output row of the thread below it.
+ YuvBuffer& superres_line_buffer_;
const BlockParametersHolder& block_parameters_;
// Frame buffer to hold cdef filtered frame.
YuvBuffer cdef_filtered_buffer_;
@@ -520,24 +518,27 @@
// A view into |frame_buffer_| that points to the output of the Loop Restored
// planes (to facilitate in-place Loop Restoration).
uint8_t* loop_restoration_buffer_[kMaxPlanes];
- // Buffer used to store the deblocked pixels that are necessary for loop
+ YuvBuffer& cdef_border_;
+ // Buffer used to store the border pixels that are necessary for loop
// restoration. This buffer will store 4 rows for every 64x64 block (4 rows
// for every 32x32 for chroma with subsampling). The indices of the rows that
- // are stored are specified in |kDeblockedRowsForLoopRestoration|. First 4
- // rows of this buffer are never populated and never used.
- // This buffer is used only when both Cdef and Loop Restoration are on.
- YuvBuffer& deblock_buffer_;
+ // are stored are specified in |kLoopRestorationBorderRows|. First 4 rows of
+ // this buffer are never populated and never used.
+ // This buffer is used only when both of the following conditions are true:
+ // (1). Loop Restoration is on.
+ // (2). Cdef is on, or multi-threading is enabled for post filter.
+ YuvBuffer& loop_restoration_border_;
const uint8_t do_post_filter_mask_;
ThreadPool* const thread_pool_;
- const int window_buffer_width_;
- const int window_buffer_height_;
// Tracks the progress of the post filters.
int progress_row_ = -1;
// A block buffer to hold the input that is converted to uint16_t before
- // cdef filtering. Only used in single threaded case.
- uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
+ // cdef filtering. Only used in the single-threaded case. The Y plane is
+ // processed separately, and the U and V planes are processed together, so
+ // it is sufficient for this buffer to accommodate 2 planes at a time.
+ uint16_t cdef_block_[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
template <int bitdepth, typename Pixel>
friend class PostFilterSuperResTest;
@@ -551,11 +552,6 @@
ptrdiff_t stride,
int left, int right,
int top, int bottom);
-extern template void PostFilter::PrepareLoopRestorationBlock<uint8_t>(
- const uint8_t* src_buffer, ptrdiff_t src_stride,
- const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dst,
- ptrdiff_t dst_stride, const int width, const int height,
- const bool frame_top_border, const bool frame_bottom_border);
#if LIBGAV1_MAX_BITDEPTH >= 10
extern template void PostFilter::ExtendFrame<uint16_t>(uint16_t* frame_start,
@@ -563,11 +559,6 @@
ptrdiff_t stride,
int left, int right,
int top, int bottom);
-extern template void PostFilter::PrepareLoopRestorationBlock<uint16_t>(
- const uint16_t* src_buffer, ptrdiff_t src_stride,
- const uint16_t* deblock_buffer, ptrdiff_t deblock_stride, uint16_t* dst,
- ptrdiff_t dst_stride, const int width, const int height,
- const bool frame_top_border, const bool frame_bottom_border);
#endif
} // namespace libgav1
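The WorkerFunction alias plus static_asserts above pin every threaded filter stage to one pointer-to-member signature, so RunJobs() can drive deblocking, CDEF, and loop restoration uniformly. A compilable sketch of the pattern, with the thread-pool scheduling elided and a row step of 16 4x4 units (one 64-pixel superblock row) assumed:

#include <atomic>
#include <type_traits>

class PostFilterSketch {
 public:
  using WorkerFunction =
      void (PostFilterSketch::*)(std::atomic<int>* row4x4_atomic);

  // The real RunJobs() also schedules (this->*worker)(&row4x4) on each pool
  // thread; here only the calling thread runs it.
  void RunJobs(WorkerFunction worker) {
    std::atomic<int> row4x4{0};
    (this->*worker)(&row4x4);
  }

  void Filter() { RunJobs(&PostFilterSketch::ApplyCdefWorker); }

 private:
  void ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
    int row4x4;
    // Workers claim superblock rows via fetch_add until the frame runs out.
    while ((row4x4 = row4x4_atomic->fetch_add(
                16, std::memory_order_relaxed)) < rows4x4_) {
      // ... filter the superblock row starting at |row4x4| ...
    }
  }
  static_assert(std::is_same<decltype(&PostFilterSketch::ApplyCdefWorker),
                             WorkerFunction>::value,
                "");
  int rows4x4_ = 0;
};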
diff --git a/libgav1/src/post_filter/cdef.cc b/libgav1/src/post_filter/cdef.cc
index 9b6bb00..f32b0a0 100644
--- a/libgav1/src/post_filter/cdef.cc
+++ b/libgav1/src/post_filter/cdef.cc
@@ -26,16 +26,20 @@
{{0, 1, 2, 3, 4, 5, 6, 7}, {1, 2, 2, 2, 3, 4, 6, 0}},
{{7, 0, 2, 4, 5, 6, 6, 6}, {0, 1, 2, 3, 4, 5, 6, 7}}};
+constexpr int kCdefBorderRows[2][4] = {{0, 1, 62, 63}, {0, 1, 30, 31}};
+
template <typename Pixel>
void CopyRowForCdef(const Pixel* src, int block_width, int unit_width,
bool is_frame_left, bool is_frame_right,
- uint16_t* const dst) {
+ uint16_t* const dst, const Pixel* left_border = nullptr) {
if (sizeof(src[0]) == sizeof(dst[0])) {
if (is_frame_left) {
Memset(dst - kCdefBorder, kCdefLargeValue, kCdefBorder);
- } else {
+ } else if (left_border == nullptr) {
memcpy(dst - kCdefBorder, src - kCdefBorder,
kCdefBorder * sizeof(dst[0]));
+ } else {
+ memcpy(dst - kCdefBorder, left_border, kCdefBorder * sizeof(dst[0]));
}
memcpy(dst, src, block_width * sizeof(dst[0]));
if (is_frame_right) {
@@ -47,8 +51,18 @@
}
return;
}
- for (int x = -kCdefBorder; x < 0; ++x) {
- dst[x] = is_frame_left ? static_cast<uint16_t>(kCdefLargeValue) : src[x];
+ if (is_frame_left) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = static_cast<uint16_t>(kCdefLargeValue);
+ }
+ } else if (left_border == nullptr) {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = src[x];
+ }
+ } else {
+ for (int x = -kCdefBorder; x < 0; ++x) {
+ dst[x] = left_border[x + kCdefBorder];
+ }
}
for (int x = 0; x < block_width; ++x) {
dst[x] = src[x];
@@ -72,38 +86,48 @@
} // namespace
-uint8_t* PostFilter::GetCdefBufferAndStride(const int start_x,
- const int start_y, const int plane,
- const int window_buffer_plane_size,
- int* cdef_stride) const {
- if (thread_pool_ != nullptr) {
- // write output to threaded_window_buffer.
- *cdef_stride = window_buffer_width_ * pixel_size_;
- const int column_window =
- start_x % (window_buffer_width_ >> subsampling_x_[plane]);
- const int row_window =
- start_y % (window_buffer_height_ >> subsampling_y_[plane]);
- return threaded_window_buffer_ + plane * window_buffer_plane_size +
- row_window * (*cdef_stride) + column_window * pixel_size_;
- }
- // write output to |cdef_buffer_|.
- *cdef_stride = frame_buffer_.stride(plane);
- return cdef_buffer_[plane] + start_y * (*cdef_stride) + start_x * pixel_size_;
+void PostFilter::SetupCdefBorder(int row4x4) {
+ assert(row4x4 >= 0);
+ assert(DoCdef());
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t dst_stride = cdef_border_.stride(plane);
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels = SubsampledValue(
+ MultiplyBy4(frame_header_.columns4x4), subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(MultiplyBy4(frame_header_.rows4x4),
+ subsampling_y_[plane]);
+ for (int i = 0; i < 4; ++i) {
+ const int row = kCdefBorderRows[subsampling_y_[plane]][i];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ if (absolute_row >= plane_height) break;
+ const uint8_t* src =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ uint8_t* dst = cdef_border_.data(plane) + dst_stride * (row_offset + i);
+ memcpy(dst, src, row_width);
+ }
+ } while (++plane < planes_);
}
template <typename Pixel>
void PostFilter::PrepareCdefBlock(int block_width4x4, int block_height4x4,
int row4x4, int column4x4,
uint16_t* cdef_source, ptrdiff_t cdef_stride,
- const bool y_plane) {
+ const bool y_plane,
+ const uint8_t border_columns[kMaxPlanes][256],
+ bool use_border_columns) {
assert(y_plane || planes_ == kMaxPlanes);
const int max_planes = y_plane ? 1 : kMaxPlanes;
const int8_t subsampling_x = y_plane ? 0 : subsampling_x_[kPlaneU];
const int8_t subsampling_y = y_plane ? 0 : subsampling_y_[kPlaneU];
const int start_x = MultiplyBy4(column4x4) >> subsampling_x;
const int start_y = MultiplyBy4(row4x4) >> subsampling_y;
- const int plane_width = RightShiftWithRounding(width_, subsampling_x);
- const int plane_height = RightShiftWithRounding(height_, subsampling_y);
+ const int plane_width = SubsampledValue(width_, subsampling_x);
+ const int plane_height = SubsampledValue(height_, subsampling_y);
const int block_width = MultiplyBy4(block_width4x4) >> subsampling_x;
const int block_height = MultiplyBy4(block_height4x4) >> subsampling_y;
// unit_width, unit_height are the same as block_width, block_height unless
@@ -117,20 +141,33 @@
const bool is_frame_top = row4x4 == 0;
const bool is_frame_bottom = start_y + block_height >= plane_height;
const int y_offset = is_frame_top ? 0 : kCdefBorder;
+ const int cdef_border_row_offset = DivideBy4(row4x4) - (is_frame_top ? 0 : 2);
for (int plane = y_plane ? kPlaneY : kPlaneU; plane < max_planes; ++plane) {
- uint16_t* cdef_src = cdef_source + plane * kCdefUnitSizeWithBorders *
+ uint16_t* cdef_src = cdef_source + static_cast<int>(plane == kPlaneV) *
+ kCdefUnitSizeWithBorders *
kCdefUnitSizeWithBorders;
const int src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
const Pixel* src_buffer =
reinterpret_cast<const Pixel*>(source_buffer_[plane]) +
(start_y - y_offset) * src_stride + start_x;
+ const int cdef_border_stride = cdef_border_.stride(plane) / sizeof(Pixel);
+ const Pixel* cdef_border =
+ (thread_pool_ == nullptr)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(cdef_border_.data(plane)) +
+ cdef_border_row_offset * cdef_border_stride + start_x;
// All the copying code will use negative indices for populating the left
// border. So the starting point is set to kCdefBorder.
cdef_src += kCdefBorder;
- // Copy the top 2 rows.
+ // Copy the top 2 rows as follows:
+ // If is_frame_top is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
if (is_frame_top) {
for (int y = 0; y < kCdefBorder; ++y) {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
@@ -138,24 +175,63 @@
cdef_src += cdef_stride;
}
} else {
+ const Pixel* top_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int top_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
for (int y = 0; y < kCdefBorder; ++y) {
- CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+ CopyRowForCdef(top_border, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
- src_buffer += src_stride;
+ top_border += top_border_stride;
cdef_src += cdef_stride;
+ // We need to increment |src_buffer| and |cdef_border| in this loop to
+ // set them up for the subsequent loops below.
+ src_buffer += src_stride;
+ cdef_border += cdef_border_stride;
}
}
- // Copy the body.
+ // Copy the body as follows:
+ // If multi-threaded filtering is off or if is_frame_bottom is true, all the
+ // rows are copied from |src_buffer|.
+ // Otherwise, the first |block_height|-kCdefBorder rows are copied from
+ // |src_buffer| and the last kCdefBorder rows are copied from |cdef_border|.
int y = block_height;
+ const int y_threshold =
+ (thread_pool_ == nullptr || is_frame_bottom) ? 0 : kCdefBorder;
+ const Pixel* left_border =
+ (thread_pool_ == nullptr || !use_border_columns)
+ ? nullptr
+ : reinterpret_cast<const Pixel*>(border_columns[plane]);
do {
CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
- is_frame_right, cdef_src);
+ is_frame_right, cdef_src, left_border);
cdef_src += cdef_stride;
src_buffer += src_stride;
- } while (--y != 0);
+ if (left_border != nullptr) left_border += kCdefBorder;
+ } while (--y != y_threshold);
- // Copy the bottom 2 rows.
+ if (y > 0) {
+ assert(y == kCdefBorder);
+ // |cdef_border| now points to the top 2 rows of the current block. For
+ // the next loop, we need it to point to the bottom 2 rows of the
+ // current block. So increment it by 2 rows.
+ cdef_border += MultiplyBy2(cdef_border_stride);
+ for (int i = 0; i < kCdefBorder; ++i) {
+ CopyRowForCdef(cdef_border, block_width, unit_width, is_frame_left,
+ is_frame_right, cdef_src);
+ cdef_src += cdef_stride;
+ cdef_border += cdef_border_stride;
+ }
+ }
+
+ // Copy the bottom 2 rows as follows:
+ // If is_frame_bottom is true, both rows are set to kCdefLargeValue.
+ // Otherwise:
+ // If multi-threaded filtering is off, the rows are copied from
+ // |src_buffer|.
+ // Otherwise, the rows are copied from |cdef_border|.
+ y = 0;
if (is_frame_bottom) {
do {
Memset(cdef_src - kCdefBorder, kCdefLargeValue,
@@ -163,10 +239,14 @@
cdef_src += cdef_stride;
} while (++y < kCdefBorder + unit_height - block_height);
} else {
+ const Pixel* bottom_border =
+ (thread_pool_ == nullptr) ? src_buffer : cdef_border;
+ const int bottom_border_stride =
+ (thread_pool_ == nullptr) ? src_stride : cdef_border_stride;
do {
- CopyRowForCdef(src_buffer, block_width, unit_width, is_frame_left,
+ CopyRowForCdef(bottom_border, block_width, unit_width, is_frame_left,
is_frame_right, cdef_src);
- src_buffer += src_stride;
+ bottom_border += bottom_border_stride;
cdef_src += cdef_stride;
} while (++y < kCdefBorder + unit_height - block_height);
}
@@ -178,54 +258,91 @@
const int block_width4x4,
const int block_height4x4,
const int row4x4_start,
- const int column4x4_start) {
+ const int column4x4_start,
+ uint8_t border_columns[2][kMaxPlanes][256],
+ bool use_border_columns[2][2]) {
// Cdef operates in 8x8 blocks (4x4 for chroma with subsampling).
static constexpr int kStep = 8;
static constexpr int kStep4x4 = 2;
- const int window_buffer_plane_size =
- window_buffer_width_ * window_buffer_height_ * sizeof(Pixel);
int cdef_buffer_row_base_stride[kMaxPlanes];
- int cdef_buffer_stride[kMaxPlanes];
uint8_t* cdef_buffer_row_base[kMaxPlanes];
int src_buffer_row_base_stride[kMaxPlanes];
const uint8_t* src_buffer_row_base[kMaxPlanes];
+ const uint16_t* cdef_src_row_base[kMaxPlanes];
+ int cdef_src_row_base_stride[kMaxPlanes];
int column_step[kMaxPlanes];
- assert(planes_ >= 1);
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- const int start_y = MultiplyBy4(row4x4_start) >> subsampling_y_[plane];
- const int start_x = MultiplyBy4(column4x4_start) >> subsampling_x_[plane];
- cdef_buffer_row_base[plane] = GetCdefBufferAndStride(
- start_x, start_y, plane, window_buffer_plane_size,
- &cdef_buffer_stride[plane]);
+ assert(planes_ == kMaxPlanesMonochrome || planes_ == kMaxPlanes);
+ int plane = kPlaneY;
+ do {
+ cdef_buffer_row_base[plane] =
+ GetCdefBuffer(static_cast<Plane>(plane), row4x4_start, column4x4_start);
cdef_buffer_row_base_stride[plane] =
- cdef_buffer_stride[plane] * (kStep >> subsampling_y_[plane]);
- src_buffer_row_base[plane] = source_buffer_[plane] +
- start_y * frame_buffer_.stride(plane) +
- start_x * sizeof(Pixel);
+ frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ src_buffer_row_base[plane] = GetSourceBuffer(static_cast<Plane>(plane),
+ row4x4_start, column4x4_start);
src_buffer_row_base_stride[plane] =
frame_buffer_.stride(plane) * (kStep >> subsampling_y_[plane]);
+ cdef_src_row_base[plane] =
+ cdef_block +
+ static_cast<int>(plane == kPlaneV) * kCdefUnitSizeWithBorders *
+ kCdefUnitSizeWithBorders +
+ kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
+ cdef_src_row_base_stride[plane] =
+ kCdefUnitSizeWithBorders * (kStep >> subsampling_y_[plane]);
column_step[plane] = (kStep >> subsampling_x_[plane]) * sizeof(Pixel);
- }
+ } while (++plane < planes_);
+
+ // |border_columns| contains two buffers. In each call to this function, one
+ // of them is used as the "destination" of the current call and the other as
+ // the "source" of the current call (it was the "destination" of the previous
+ // call). The src_index selects the borders that were backed up in the
+ // previous call; the dst_index selects where the borders for the next call
+ // are saved.
+ const int border_columns_src_index = DivideBy16(column4x4_start) & 1;
+ const int border_columns_dst_index = border_columns_src_index ^ 1;
if (index == -1) {
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
- cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
- MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
- MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
- sizeof(Pixel));
+ if (thread_pool_ == nullptr) {
+ int plane = kPlaneY;
+ do {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ } while (++plane < planes_);
}
+ use_border_columns[border_columns_dst_index][0] = false;
+ use_border_columns[border_columns_dst_index][1] = false;
return;
}
- PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
- column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
- true);
+ const bool is_frame_right =
+ MultiplyBy4(column4x4_start) + MultiplyBy4(block_width4x4) >= width_;
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ // Back up the last 2 columns for use in the next iteration.
+ use_border_columns[border_columns_dst_index][0] = true;
+ const uint8_t* src_line =
+ GetSourceBuffer(kPlaneY, row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(kPlaneY),
+ border_columns[border_columns_dst_index][kPlaneY],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4), sizeof(Pixel));
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, true,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][0]);
// Stored direction used during the u/v pass. If bit 3 is set, then block is
// a skip.
- int direction_y[8 * 8];
+ uint8_t direction_y[8 * 8];
int y_index = 0;
const uint8_t y_primary_strength =
@@ -248,14 +365,16 @@
do {
uint8_t* cdef_buffer_base = cdef_buffer_row_base[kPlaneY];
const uint8_t* src_buffer_base = src_buffer_row_base[kPlaneY];
+ const uint16_t* cdef_src_base = cdef_src_row_base[kPlaneY];
BlockParameters* const* bp0 = bp_row0_base;
BlockParameters* const* bp1 = bp_row1_base;
int column4x4 = column4x4_start;
do {
const int block_width = kStep;
const int block_height = kStep;
- const int cdef_stride = cdef_buffer_stride[kPlaneY];
+ const int cdef_stride = frame_buffer_.stride(kPlaneY);
uint8_t* const cdef_buffer = cdef_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
const int src_stride = frame_buffer_.stride(kPlaneY);
const uint8_t* const src_buffer = src_buffer_base;
@@ -264,16 +383,39 @@
if (skip) { // No cdef filtering.
direction_y[y_index] = kCdefSkip;
- CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
- block_width, block_height, sizeof(Pixel));
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
} else {
// Zero out residual skip flag.
direction_y[y_index] = 0;
int variance = 0;
if (compute_direction_and_variance) {
- dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
- &variance);
+ if (thread_pool_ == nullptr ||
+ row4x4 + kStep4x4 < row4x4_start + block_height4x4) {
+ dsp_.cdef_direction(src_buffer, src_stride, &direction_y[y_index],
+ &variance);
+ } else if (sizeof(Pixel) == 2) {
+ dsp_.cdef_direction(cdef_src, kCdefUnitSizeWithBorders * 2,
+ &direction_y[y_index], &variance);
+ } else {
+ // If we are in the last row4x4 for this unit, then the last two
+ // input rows have to come from |cdef_border_|. Since we already
+ // have |cdef_src| populated correctly, use that as the input
+ // for the direction process.
+ uint8_t direction_src[8][8];
+ const uint16_t* cdef_src_line = cdef_src;
+ for (auto& direction_src_line : direction_src) {
+ for (int i = 0; i < 8; ++i) {
+ direction_src_line[i] = cdef_src_line[i];
+ }
+ cdef_src_line += kCdefUnitSizeWithBorders;
+ }
+ dsp_.cdef_direction(direction_src, 8, &direction_y[y_index],
+ &variance);
+ }
}
const int direction =
(y_primary_strength == 0) ? 0 : direction_y[y_index];
@@ -283,16 +425,12 @@
(variance != 0)
? (y_primary_strength * (4 + variance_strength) + 8) >> 4
: 0;
-
if ((primary_strength | y_secondary_strength) == 0) {
- CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
- block_width, block_height, sizeof(Pixel));
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
} else {
- uint16_t* cdef_src =
- cdef_block + kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
- cdef_src +=
- (MultiplyBy4(row4x4 - row4x4_start)) * kCdefUnitSizeWithBorders +
- (MultiplyBy4(column4x4 - column4x4_start));
const int strength_index =
y_strength_index | (static_cast<int>(primary_strength == 0) << 1);
dsp_.cdef_filters[1][strength_index](
@@ -303,6 +441,7 @@
}
cdef_buffer_base += column_step[kPlaneY];
src_buffer_base += column_step[kPlaneY];
+ cdef_src_base += column_step[kPlaneY] / sizeof(Pixel);
bp0 += kStep4x4;
bp1 += kStep4x4;
@@ -312,6 +451,7 @@
cdef_buffer_row_base[kPlaneY] += cdef_buffer_row_base_stride[kPlaneY];
src_buffer_row_base[kPlaneY] += src_buffer_row_base_stride[kPlaneY];
+ cdef_src_row_base[kPlaneY] += cdef_src_row_base_stride[kPlaneY];
bp_row0_base += bp_stride;
bp_row1_base += bp_stride;
row4x4 += kStep4x4;
@@ -327,19 +467,41 @@
frame_header_.cdef.uv_secondary_strength[index];
if ((uv_primary_strength | uv_secondary_strength) == 0) {
- for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
- CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
- cdef_buffer_row_base[plane], cdef_buffer_stride[plane],
- MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
- MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
- sizeof(Pixel));
+ if (thread_pool_ == nullptr) {
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ CopyPixels(src_buffer_row_base[plane], frame_buffer_.stride(plane),
+ cdef_buffer_row_base[plane], frame_buffer_.stride(plane),
+ MultiplyBy4(block_width4x4) >> subsampling_x_[plane],
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
}
+ use_border_columns[border_columns_dst_index][1] = false;
return;
}
- PrepareCdefBlock<Pixel>(block_width4x4, block_height4x4, row4x4_start,
- column4x4_start, cdef_block, kCdefUnitSizeWithBorders,
- false);
+ if (!is_frame_right && thread_pool_ != nullptr) {
+ use_border_columns[border_columns_dst_index][1] = true;
+ for (int plane = kPlaneU; plane <= kPlaneV; ++plane) {
+ // Back up the last 2 columns for use in the next iteration.
+ const uint8_t* src_line =
+ GetSourceBuffer(static_cast<Plane>(plane), row4x4_start,
+ column4x4_start + block_width4x4) -
+ kCdefBorder * sizeof(Pixel);
+ CopyPixels(src_line, frame_buffer_.stride(plane),
+ border_columns[border_columns_dst_index][plane],
+ kCdefBorder * sizeof(Pixel), kCdefBorder,
+ MultiplyBy4(block_height4x4) >> subsampling_y_[plane],
+ sizeof(Pixel));
+ }
+ }
+
+ PrepareCdefBlock<Pixel>(
+ block_width4x4, block_height4x4, row4x4_start, column4x4_start,
+ cdef_block, kCdefUnitSizeWithBorders, false,
+ (border_columns != nullptr) ? border_columns[border_columns_src_index]
+ : nullptr,
+ use_border_columns[border_columns_src_index][1]);
// uv_strength_index is 0 for both primary and secondary strengths being
// non-zero, 1 for primary only, 2 for secondary only.
@@ -357,18 +519,22 @@
do {
uint8_t* cdef_buffer_base = cdef_buffer_row_base[plane];
const uint8_t* src_buffer_base = src_buffer_row_base[plane];
+ const uint16_t* cdef_src_base = cdef_src_row_base[plane];
int column4x4 = column4x4_start;
do {
- const int cdef_stride = cdef_buffer_stride[plane];
+ const int cdef_stride = frame_buffer_.stride(plane);
uint8_t* const cdef_buffer = cdef_buffer_base;
const int src_stride = frame_buffer_.stride(plane);
const uint8_t* const src_buffer = src_buffer_base;
+ const uint16_t* const cdef_src = cdef_src_base;
const bool skip = (direction_y[y_index] & kCdefSkip) != 0;
int dual_cdef = 0;
if (skip) { // No cdef filtering.
- CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
- block_width, block_height, sizeof(Pixel));
+ if (thread_pool_ == nullptr) {
+ CopyPixels(src_buffer, src_stride, cdef_buffer, cdef_stride,
+ block_width, block_height, sizeof(Pixel));
+ }
} else {
// Make sure block pair is not out of bounds.
if (column4x4 + (kStep4x4 * 2) <= column4x4_start + block_width4x4) {
@@ -396,13 +562,6 @@
}
}
- uint16_t* cdef_src = cdef_block + plane * kCdefUnitSizeWithBorders *
- kCdefUnitSizeWithBorders;
- cdef_src += kCdefBorder * kCdefUnitSizeWithBorders + kCdefBorder;
- cdef_src +=
- (MultiplyBy4(row4x4 - row4x4_start) >> subsampling_y) *
- kCdefUnitSizeWithBorders +
- (MultiplyBy4(column4x4 - column4x4_start) >> subsampling_x);
// Block width is 8 if either dual_cdef is true or subsampling_x == 0.
const int width_index = dual_cdef | (subsampling_x ^ 1);
dsp_.cdef_filters[width_index][uv_strength_index](
@@ -415,19 +574,23 @@
// so adjust the pointers and indexes for 2 blocks.
cdef_buffer_base += column_step[plane] << dual_cdef;
src_buffer_base += column_step[plane] << dual_cdef;
+ cdef_src_base += (column_step[plane] / sizeof(Pixel)) << dual_cdef;
column4x4 += kStep4x4 << dual_cdef;
y_index += 1 << dual_cdef;
} while (column4x4 < column4x4_start + block_width4x4);
cdef_buffer_row_base[plane] += cdef_buffer_row_base_stride[plane];
src_buffer_row_base[plane] += src_buffer_row_base_stride[plane];
+ cdef_src_row_base[plane] += cdef_src_row_base_stride[plane];
row4x4 += kStep4x4;
} while (row4x4 < row4x4_start + block_height4x4);
}
}
-void PostFilter::ApplyCdefForOneSuperBlockRowHelper(int row4x4,
- int block_height4x4) {
+void PostFilter::ApplyCdefForOneSuperBlockRowHelper(
+ uint16_t* cdef_block, uint8_t border_columns[2][kMaxPlanes][256],
+ int row4x4, int block_height4x4) {
+ bool use_border_columns[2][2] = {};
for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
column4x4 += kStep64x64) {
const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
@@ -436,13 +599,15 @@
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
- ApplyCdefForOneUnit<uint16_t>(cdef_block_, index, block_width4x4,
- block_height4x4, row4x4, column4x4);
+ ApplyCdefForOneUnit<uint16_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
continue;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
- ApplyCdefForOneUnit<uint8_t>(cdef_block_, index, block_width4x4,
- block_height4x4, row4x4, column4x4);
+ ApplyCdefForOneUnit<uint8_t>(cdef_block, index, block_width4x4,
+ block_height4x4, row4x4, column4x4,
+ border_columns, use_border_columns);
}
}
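
The |border_columns| buffers threaded through this helper come in src/dst pairs: while a 64x64 unit is filtered in place, the unfiltered right-edge columns of the previous unit must still be readable, so the two buffers swap roles from one unit to the next. A minimal sketch of that ping-pong, assuming the indices simply alternate (libgav1 derives them from the unit position; BackupColumns and FilterUnit are hypothetical placeholders):

    #include <cstdint>

    // Double-buffered column backup: while unit n is filtered in place,
    // the unfiltered right-edge columns of unit n-1 stay readable in the
    // buffer that is currently playing the "src" role.
    void FilterUnitsSketch(int num_units) {
      uint8_t border_columns[2][256] = {};
      (void)border_columns;
      int src_index = 0;
      for (int unit = 0; unit < num_units; ++unit) {
        const int dst_index = src_index ^ 1;
        // 1. Back up this unit's last two columns before filtering
        //    overwrites them: BackupColumns(unit, border_columns[dst_index]);
        // 2. Filter the unit, reading the previous unit's saved columns:
        //    FilterUnit(unit, border_columns[src_index]);
        src_index = dst_index;  // the buffers swap roles for the next unit
      }
    }

    int main() { FilterUnitsSketch(4); }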
@@ -461,7 +626,7 @@
// first iteration (y == 0).
if (row4x4 > 0 && (!is_last_row || y == 0)) {
assert(row4x4 >= 16);
- ApplyCdefForOneSuperBlockRowHelper(row4x4 - 2, 2);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4 - 2, 2);
}
// Apply cdef for the current superblock row. If this is the last superblock
@@ -471,101 +636,25 @@
std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
const int height4x4 = block_height4x4 - (is_last_row ? 0 : 2);
if (height4x4 > 0) {
- ApplyCdefForOneSuperBlockRowHelper(row4x4, height4x4);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block_, nullptr, row4x4,
+ height4x4);
}
}
}
-template <typename Pixel>
-void PostFilter::ApplyCdefForOneRowInWindow(const int row4x4,
- const int column4x4_start) {
- uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 3];
-
- for (int column4x4_64x64 = 0;
- column4x4_64x64 < std::min(DivideBy4(window_buffer_width_),
- frame_header_.columns4x4 - column4x4_start);
- column4x4_64x64 += kStep64x64) {
- const int column4x4 = column4x4_start + column4x4_64x64;
- const int index = cdef_index_[DivideBy16(row4x4)][DivideBy16(column4x4)];
- const int block_width4x4 =
- std::min(kStep64x64, frame_header_.columns4x4 - column4x4);
+void PostFilter::ApplyCdefWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ uint16_t cdef_block[kCdefUnitSizeWithBorders * kCdefUnitSizeWithBorders * 2];
+ // Each border_columns buffer has to store 64 rows and 2 columns for each
+ // plane. For 10-bit, that is 64*2*2 = 256 bytes.
+ alignas(kMaxAlignment) uint8_t border_columns[2][kMaxPlanes][256];
+ while ((row4x4 = row4x4_atomic->fetch_add(
+ kStep64x64, std::memory_order_relaxed)) < frame_header_.rows4x4) {
const int block_height4x4 =
std::min(kStep64x64, frame_header_.rows4x4 - row4x4);
-
- ApplyCdefForOneUnit<Pixel>(cdef_block, index, block_width4x4,
- block_height4x4, row4x4, column4x4);
+ ApplyCdefForOneSuperBlockRowHelper(cdef_block, border_columns, row4x4,
+ block_height4x4);
}
}
-// Each thread processes one row inside the window.
-// Y, U, V planes are processed together inside one thread.
-template <typename Pixel>
-void PostFilter::ApplyCdefThreaded() {
- assert((window_buffer_height_ & 63) == 0);
- const int num_workers = thread_pool_->num_threads();
- const int window_buffer_plane_size =
- window_buffer_width_ * window_buffer_height_;
- const int window_buffer_height4x4 = DivideBy4(window_buffer_height_);
- for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
- row4x4 += window_buffer_height4x4) {
- const int actual_window_height4x4 =
- std::min(window_buffer_height4x4, frame_header_.rows4x4 - row4x4);
- const int vertical_units_per_window =
- DivideBy16(actual_window_height4x4 + 15);
- for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
- column4x4 += DivideBy4(window_buffer_width_)) {
- const int jobs_for_threadpool =
- vertical_units_per_window * num_workers / (num_workers + 1);
- BlockingCounter pending_jobs(jobs_for_threadpool);
- int job_count = 0;
- for (int row64x64 = 0; row64x64 < actual_window_height4x4;
- row64x64 += kStep64x64) {
- if (job_count < jobs_for_threadpool) {
- thread_pool_->Schedule(
- [this, row4x4, column4x4, row64x64, &pending_jobs]() {
- ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
- pending_jobs.Decrement();
- });
- } else {
- ApplyCdefForOneRowInWindow<Pixel>(row4x4 + row64x64, column4x4);
- }
- ++job_count;
- }
- pending_jobs.Wait();
-
- // Copy |threaded_window_buffer_| to |cdef_buffer_|.
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- const ptrdiff_t src_stride =
- frame_buffer_.stride(plane) / sizeof(Pixel);
- const int plane_row = MultiplyBy4(row4x4) >> subsampling_y_[plane];
- const int plane_column =
- MultiplyBy4(column4x4) >> subsampling_x_[plane];
- int copy_width = std::min(frame_header_.columns4x4 - column4x4,
- DivideBy4(window_buffer_width_));
- copy_width = MultiplyBy4(copy_width) >> subsampling_x_[plane];
- int copy_height =
- std::min(frame_header_.rows4x4 - row4x4, window_buffer_height4x4);
- copy_height = MultiplyBy4(copy_height) >> subsampling_y_[plane];
- CopyPlane<Pixel>(
- reinterpret_cast<const Pixel*>(threaded_window_buffer_) +
- plane * window_buffer_plane_size,
- window_buffer_width_, copy_width, copy_height,
- reinterpret_cast<Pixel*>(cdef_buffer_[plane]) +
- plane_row * src_stride + plane_column,
- src_stride);
- }
- }
- }
-}
-
-void PostFilter::ApplyCdef() {
-#if LIBGAV1_MAX_BITDEPTH >= 10
- if (bitdepth_ >= 10) {
- ApplyCdefThreaded<uint16_t>();
- return;
- }
-#endif
- ApplyCdefThreaded<uint8_t>();
-}
-
} // namespace libgav1
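
ApplyCdefWorker above replaces the per-window job scheduling with a shared atomic row counter: each worker claims the next superblock row with a relaxed fetch_add until the frame is exhausted. A standalone sketch of the pattern, with illustrative names (ProcessRow is hypothetical):

    #include <atomic>
    #include <thread>
    #include <vector>

    // Workers claim rows with a relaxed fetch_add; rows are processed
    // independently, so no further synchronization is needed.
    void RowWorker(std::atomic<int>* next_row4x4, int rows4x4, int step) {
      int row4x4;
      while ((row4x4 = next_row4x4->fetch_add(
                  step, std::memory_order_relaxed)) < rows4x4) {
        // ProcessRow(row4x4);
      }
    }

    int main() {
      std::atomic<int> next_row4x4(0);
      std::vector<std::thread> threads;
      for (int i = 0; i < 4; ++i) {
        threads.emplace_back(RowWorker, &next_row4x4, /*rows4x4=*/128,
                             /*step=*/16);
      }
      for (auto& t : threads) t.join();
      return 0;
    }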
diff --git a/libgav1/src/post_filter/deblock.cc b/libgav1/src/post_filter/deblock.cc
index c4e0852..9b5ed0f 100644
--- a/libgav1/src/post_filter/deblock.cc
+++ b/libgav1/src/post_filter/deblock.cc
@@ -14,7 +14,6 @@
#include <atomic>
#include "src/post_filter.h"
-#include "src/utils/blocking_counter.h"
namespace libgav1 {
namespace {
@@ -261,7 +260,7 @@
kDeblockFilterLevelIndex[kPlaneU][kLoopFilterTypeVertical];
const int filter_id_v =
kDeblockFilterLevelIndex[kPlaneV][kLoopFilterTypeVertical];
- const BlockParameters* bp_prev = *(bp_ptr - (1 << subsampling_x));
+ const BlockParameters* bp_prev = *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
if (bp == bp_prev) {
// Not a border.
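
The ptrdiff_t{1} change forms the offset in the pointer's own width before the subtraction. A compilable illustration (the motivation is an assumption; the diff does not state it):

    #include <cstddef>

    const int* PrevBlock(const int* const* bp_ptr, int subsampling_x) {
      // Form the offset in ptrdiff_t so the pointer arithmetic stays in
      // the pointer's native width (the presumed reason for the change;
      // with a 0/1 shift this cannot overflow in either type).
      return *(bp_ptr - (ptrdiff_t{1} << subsampling_x));
    }

    int main() {
      const int a = 1, b = 2;
      const int* table[2] = {&a, &b};
      return PrevBlock(&table[1], /*subsampling_x=*/0) == &a ? 0 : 1;
    }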
@@ -299,7 +298,7 @@
void PostFilter::HorizontalDeblockFilter(int row4x4_start,
int column4x4_start) {
const int column_step = 1;
- const size_t src_step = MultiplyBy4(pixel_size_);
+ const int src_step = 4 << pixel_size_log2_;
const ptrdiff_t src_stride = frame_buffer_.stride(kPlaneY);
uint8_t* src = GetSourceBuffer(kPlaneY, row4x4_start, column4x4_start);
int row_step;
@@ -383,6 +382,7 @@
BlockParameters* const* bp_row_base =
block_parameters_.Address(row4x4_start, column4x4_start);
const int bp_stride = block_parameters_.columns4x4();
+ const int column_step_shift = pixel_size_log2_;
for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
MultiplyBy4(row4x4_start + row4x4) < height_;
++row4x4, src += row_stride, bp_row_base += bp_stride) {
@@ -400,7 +400,7 @@
src_row, src_stride, outer_thresh_[level], inner_thresh_[level],
HevThresh(level));
}
- src_row += column_step * pixel_size_;
+ src_row += column_step << column_step_shift;
column_step = DivideBy4(column_step);
}
}
@@ -424,7 +424,7 @@
BlockParameters* const* bp_row_base = block_parameters_.Address(
GetDeblockPosition(row4x4_start, subsampling_y),
GetDeblockPosition(column4x4_start, subsampling_x));
- const int bp_stride = block_parameters_.columns4x4() * row_step;
+ const int bp_stride = block_parameters_.columns4x4() << subsampling_y;
for (int row4x4 = 0; row4x4 < kNum4x4InLoopFilterUnit &&
MultiplyBy4(row4x4_start + row4x4) < height_;
row4x4 += row_step, src_u += row_stride_u, src_v += row_stride_v,
@@ -450,8 +450,8 @@
src_row_v, src_stride_v, outer_thresh_[level_v],
inner_thresh_[level_v], HevThresh(level_v));
}
- src_row_u += column_step * pixel_size_;
- src_row_v += column_step * pixel_size_;
+ src_row_u += column_step << column_step_shift;
+ src_row_v += column_step << column_step_shift;
column_step = DivideBy4(column_step << subsampling_x);
}
}
@@ -481,67 +481,23 @@
}
}
-void PostFilter::DeblockFilterWorker(int jobs_per_plane,
- const Plane* /*planes*/,
- int /*num_planes*/,
- std::atomic<int>* job_counter,
- DeblockFilter deblock_filter) {
- const int total_jobs = jobs_per_plane;
- int job_index;
- while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
- total_jobs) {
- const int row_unit = job_index % jobs_per_plane;
- const int row4x4 = row_unit * kNum4x4InLoopFilterUnit;
+template <LoopFilterType loop_filter_type>
+void PostFilter::DeblockFilterWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopFilterUnit,
+ std::memory_order_relaxed)) <
+ frame_header_.rows4x4) {
for (int column4x4 = 0; column4x4 < frame_header_.columns4x4;
column4x4 += kNum4x4InLoopFilterUnit) {
- (this->*deblock_filter)(row4x4, column4x4);
+ (this->*deblock_filter_func_[loop_filter_type])(row4x4, column4x4);
}
}
}
-void PostFilter::ApplyDeblockFilterThreaded() {
- const int jobs_per_plane = DivideBy16(frame_header_.rows4x4 + 15);
- const int num_workers = thread_pool_->num_threads();
- std::array<Plane, kMaxPlanes> planes;
- planes[0] = kPlaneY;
- int num_planes = 1;
- for (int plane = kPlaneU; plane < planes_; ++plane) {
- if (frame_header_.loop_filter.level[plane + 1] != 0) {
- planes[num_planes++] = static_cast<Plane>(plane);
- }
- }
- // The vertical filters are not dependent on each other. So simply schedule
- // them for all possible rows.
- //
- // The horizontal filter for a row/column depends on the vertical filter being
- // finished for the blocks to the top and to the right. To work around
- // this synchronization, we simply wait for the vertical filter to finish for
- // all rows. Now, the horizontal filters can also be scheduled
- // unconditionally similar to the vertical filters.
- //
- // The only synchronization involved is to know when each directional
- // filter is complete for the entire frame.
- for (const auto& type :
- {kLoopFilterTypeVertical, kLoopFilterTypeHorizontal}) {
- const DeblockFilter deblock_filter = deblock_filter_func_[type];
- std::atomic<int> job_counter(0);
- BlockingCounter pending_workers(num_workers);
- for (int i = 0; i < num_workers; ++i) {
- thread_pool_->Schedule([this, jobs_per_plane, &planes, num_planes,
- &job_counter, deblock_filter,
- &pending_workers]() {
- DeblockFilterWorker(jobs_per_plane, planes.data(), num_planes,
- &job_counter, deblock_filter);
- pending_workers.Decrement();
- });
- }
- // Run the jobs on the current thread.
- DeblockFilterWorker(jobs_per_plane, planes.data(), num_planes, &job_counter,
- deblock_filter);
- // Wait for the threadpool jobs to finish.
- pending_workers.Wait();
- }
-}
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>(
+ std::atomic<int>* row4x4_atomic);
+template void PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>(
+ std::atomic<int>* row4x4_atomic);
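
DeblockFilterWorker is now a function template over LoopFilterType, explicitly instantiated for both directions, so each instantiation indexes the member-pointer table with a compile-time constant. A compilable sketch of the same pattern:

    #include <atomic>

    enum LoopFilterType { kLoopFilterTypeVertical, kLoopFilterTypeHorizontal };

    class Filter {
     public:
      template <LoopFilterType type>
      void Worker(std::atomic<int>* row4x4_atomic) {
        int row;
        while ((row = row4x4_atomic->fetch_add(
                    1, std::memory_order_relaxed)) < rows_) {
          // |type| is a compile-time constant in each instantiation, so
          // the table index needs no per-call branch.
          (this->*funcs_[type])(row);
        }
      }

     private:
      void Vertical(int /*row*/) {}
      void Horizontal(int /*row*/) {}
      using Func = void (Filter::*)(int);
      const Func funcs_[2] = {&Filter::Vertical, &Filter::Horizontal};
      int rows_ = 16;
    };

    // Explicit instantiations keep the template's definition in one
    // translation unit, mirroring the two instantiations in the diff.
    template void Filter::Worker<kLoopFilterTypeVertical>(std::atomic<int>*);
    template void Filter::Worker<kLoopFilterTypeHorizontal>(std::atomic<int>*);

    int main() {
      Filter f;
      std::atomic<int> rows(0);
      f.Worker<kLoopFilterTypeVertical>(&rows);
      return 0;
    }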
void PostFilter::ApplyDeblockFilter(LoopFilterType loop_filter_type,
int row4x4_start, int column4x4_start,
diff --git a/libgav1/src/post_filter/loop_restoration.cc b/libgav1/src/post_filter/loop_restoration.cc
index 17670b9..826ef48 100644
--- a/libgav1/src/post_filter/loop_restoration.cc
+++ b/libgav1/src/post_filter/loop_restoration.cc
@@ -15,182 +15,103 @@
#include "src/utils/blocking_counter.h"
namespace libgav1 {
-namespace {
template <typename Pixel>
-void CopyTwoRows(const Pixel* src, const ptrdiff_t src_stride, Pixel** dst,
- const ptrdiff_t dst_stride, const int width) {
- for (int i = 0; i < kRestorationVerticalBorder; ++i) {
- memcpy(*dst, src, sizeof(Pixel) * width);
- src += src_stride;
- *dst += dst_stride;
- }
-}
-
-} // namespace
-
-// static
-template <typename Pixel>
-void PostFilter::PrepareLoopRestorationBlock(
- const Pixel* src_buffer, const ptrdiff_t src_stride,
- const Pixel* deblock_buffer, const ptrdiff_t deblock_stride, Pixel* dst,
- const ptrdiff_t dst_stride, const int width, const int height,
- const bool frame_top_border, const bool frame_bottom_border) {
- src_buffer -=
- kRestorationVerticalBorder * src_stride + kRestorationHorizontalBorder;
- deblock_buffer -= kRestorationHorizontalBorder;
- int h = height;
- // Top 2 rows.
- if (frame_top_border) {
- h += kRestorationVerticalBorder;
- } else {
- CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride,
- width + 2 * kRestorationHorizontalBorder);
- src_buffer += kRestorationVerticalBorder * src_stride;
- // If |frame_top_border| is true, then we are in the first superblock row,
- // so in that case, do not increment |deblock_buffer| since we don't store
- // anything from the first superblock row into |deblock_buffer|.
- deblock_buffer += 4 * deblock_stride;
- }
- if (frame_bottom_border) h += kRestorationVerticalBorder;
- // Main body.
- do {
- memcpy(dst, src_buffer,
- sizeof(Pixel) * (width + 2 * kRestorationHorizontalBorder));
- src_buffer += src_stride;
- dst += dst_stride;
- } while (--h != 0);
- // Bottom 2 rows.
- if (!frame_bottom_border) {
- deblock_buffer += kRestorationVerticalBorder * deblock_stride;
- CopyTwoRows<Pixel>(deblock_buffer, deblock_stride, &dst, dst_stride,
- width + 2 * kRestorationHorizontalBorder);
- }
-}
-
-template void PostFilter::PrepareLoopRestorationBlock<uint8_t>(
- const uint8_t* src_buffer, ptrdiff_t src_stride,
- const uint8_t* deblock_buffer, ptrdiff_t deblock_stride, uint8_t* dst,
- ptrdiff_t dst_stride, const int width, const int height,
- const bool frame_top_border, const bool frame_bottom_border);
-
-#if LIBGAV1_MAX_BITDEPTH >= 10
-template void PostFilter::PrepareLoopRestorationBlock<uint16_t>(
- const uint16_t* src_buffer, ptrdiff_t src_stride,
- const uint16_t* deblock_buffer, ptrdiff_t deblock_stride, uint16_t* dst,
- ptrdiff_t dst_stride, const int width, const int height,
- const bool frame_top_border, const bool frame_bottom_border);
-#endif
-
-template <typename Pixel>
-void PostFilter::ApplyLoopRestorationForOneRowInWindow(
- const Pixel* src_buffer, const Plane plane, const int plane_height,
- const int plane_width, const int y, const int x, const int row,
+void PostFilter::ApplyLoopRestorationForOneRow(
+ const Pixel* src_buffer, const ptrdiff_t stride, const Plane plane,
+ const int plane_height, const int plane_width, const int unit_y,
const int unit_row, const int current_process_unit_height,
- const int plane_unit_size, const int window_width,
- Array2DView<Pixel>* const loop_restored_window) {
+ const int plane_unit_size, Pixel* dst_buffer) {
const int num_horizontal_units =
restoration_info_->num_horizontal_units(static_cast<Plane>(plane));
- const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
const RestorationUnitInfo* const restoration_info =
restoration_info_->loop_restoration_info(static_cast<Plane>(plane),
unit_row * num_horizontal_units);
- int unit_column = x / plane_unit_size;
- src_buffer += (y + row) * src_stride + x;
+ const bool in_place = DoCdef() || thread_pool_ != nullptr;
+ const Pixel* border = nullptr;
+ ptrdiff_t border_stride = 0;
+ src_buffer += unit_y * stride;
+ if (in_place) {
+ const int border_unit_y = std::max(
+ RightShiftWithCeiling(unit_y, 4 - subsampling_y_[plane]) - 4, 0);
+ border_stride = loop_restoration_border_.stride(plane) / sizeof(Pixel);
+ border =
+ reinterpret_cast<const Pixel*>(loop_restoration_border_.data(plane)) +
+ border_unit_y * border_stride;
+ }
+ int unit_column = 0;
int column = 0;
do {
- const int unit_x = x + column;
- const int unit_y = y + row;
const int current_process_unit_width =
- std::min(plane_unit_size, plane_width - unit_x);
+ std::min(plane_unit_size, plane_width - column);
const Pixel* src = src_buffer + column;
unit_column = std::min(unit_column, num_horizontal_units - 1);
if (restoration_info[unit_column].type == kLoopRestorationTypeNone) {
- const ptrdiff_t dst_stride = loop_restored_window->columns();
- Pixel* dst = &(*loop_restored_window)[row][column];
- for (int k = 0; k < current_process_unit_height; ++k) {
- if (DoCdef()) {
+ Pixel* dst = dst_buffer + column;
+ if (in_place) {
+ int k = current_process_unit_height;
+ do {
memmove(dst, src, current_process_unit_width * sizeof(Pixel));
- } else {
- memcpy(dst, src, current_process_unit_width * sizeof(Pixel));
- }
- src += src_stride;
- dst += dst_stride;
+ src += stride;
+ dst += stride;
+ } while (--k != 0);
+ } else {
+ CopyPlane(src, stride, current_process_unit_width,
+ current_process_unit_height, dst, stride);
}
} else {
- const ptrdiff_t block_buffer_stride = kRestorationUnitWidthWithBorders;
- // The SIMD implementation of wiener filter over-reads 15 -
- // |kRestorationHorizontalBorder| bytes, and the SIMD implementation of
- // self-guided filter over-reads up to 7 bytes which happens when
- // |current_process_unit_width| equals |kRestorationUnitWidth| - 7, and
- // the radius of the first pass in sfg is 0. So add 8 extra bytes at the
- // end of block_buffer for 8 bit.
- Pixel
- block_buffer[kRestorationUnitHeightWithBorders * block_buffer_stride +
- ((sizeof(Pixel) == 1) ? 15 - kRestorationHorizontalBorder
- : 0)];
- RestorationBuffer restoration_buffer;
- const Pixel* source;
- ptrdiff_t source_stride;
- if (DoCdef()) {
- const int deblock_buffer_units = 64 >> subsampling_y_[plane];
- const auto* const deblock_buffer =
- reinterpret_cast<const Pixel*>(deblock_buffer_.data(plane));
- assert(deblock_buffer != nullptr);
- const ptrdiff_t deblock_buffer_stride =
- deblock_buffer_.stride(plane) / sizeof(Pixel);
- const int deblock_unit_y =
- std::max(MultiplyBy4(Ceil(unit_y, deblock_buffer_units)) - 4, 0);
- const Pixel* const deblock_unit_buffer =
- deblock_buffer + deblock_unit_y * deblock_buffer_stride + unit_x;
- PrepareLoopRestorationBlock<Pixel>(
- src, src_stride, deblock_unit_buffer, deblock_buffer_stride,
- block_buffer, block_buffer_stride, current_process_unit_width,
- current_process_unit_height, unit_y == 0,
- unit_y + current_process_unit_height >= plane_height);
- source = block_buffer +
- kRestorationVerticalBorder * block_buffer_stride +
- kRestorationHorizontalBorder;
- source_stride = kRestorationUnitWidthWithBorders;
- } else {
- source = src;
- source_stride = src_stride;
+ const Pixel* top_border = src - kRestorationVerticalBorder * stride;
+ ptrdiff_t top_border_stride = stride;
+ const Pixel* bottom_border = src + current_process_unit_height * stride;
+ ptrdiff_t bottom_border_stride = stride;
+ const bool frame_bottom_border =
+ (unit_y + current_process_unit_height >= plane_height);
+ if (in_place && (unit_y != 0 || !frame_bottom_border)) {
+ const Pixel* loop_restoration_border = border + column;
+ if (unit_y != 0) {
+ top_border = loop_restoration_border;
+ top_border_stride = border_stride;
+ loop_restoration_border += 4 * border_stride;
+ }
+ if (!frame_bottom_border) {
+ bottom_border = loop_restoration_border +
+ kRestorationVerticalBorder * border_stride;
+ bottom_border_stride = border_stride;
+ }
}
+ RestorationBuffer restoration_buffer;
const LoopRestorationType type = restoration_info[unit_column].type;
assert(type == kLoopRestorationTypeSgrProj ||
type == kLoopRestorationTypeWiener);
const dsp::LoopRestorationFunc restoration_func =
dsp_.loop_restorations[type - 2];
- restoration_func(source, &(*loop_restored_window)[row][column],
- restoration_info[unit_column], source_stride,
- loop_restored_window->columns(),
+ restoration_func(restoration_info[unit_column], src, stride, top_border,
+ top_border_stride, bottom_border, bottom_border_stride,
current_process_unit_width, current_process_unit_height,
- &restoration_buffer);
+ &restoration_buffer, dst_buffer + column);
}
++unit_column;
column += plane_unit_size;
- } while (column < window_width);
+ } while (column < plane_width);
}
template <typename Pixel>
-void PostFilter::ApplyLoopRestorationSingleThread(const int row4x4_start,
- const int sb4x4) {
+void PostFilter::ApplyLoopRestorationForOneSuperBlockRow(const int row4x4_start,
+ const int sb4x4) {
assert(row4x4_start >= 0);
assert(DoRestoration());
- for (int plane = 0; plane < planes_; ++plane) {
+ int plane = kPlaneY;
+ do {
if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
continue;
}
const ptrdiff_t stride = frame_buffer_.stride(plane) / sizeof(Pixel);
const int unit_height_offset =
kRestorationUnitOffset >> subsampling_y_[plane];
- const int plane_height =
- RightShiftWithRounding(height_, subsampling_y_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
const int plane_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
- const int num_vertical_units =
- restoration_info_->num_vertical_units(static_cast<Plane>(plane));
- const int plane_unit_size = loop_restoration_.unit_size[plane];
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_unit_size = 1 << loop_restoration_.unit_size_log2[plane];
const int plane_process_unit_height =
kRestorationUnitHeight >> subsampling_y_[plane];
int y = (row4x4_start == 0)
@@ -203,171 +124,53 @@
for (int sb_y = 0; sb_y < sb4x4;
sb_y += 16, y += current_process_unit_height) {
if (y >= plane_height) break;
- const int unit_row = std::min((y + unit_height_offset) / plane_unit_size,
- num_vertical_units - 1);
+ const int unit_row = std::min(
+ (y + unit_height_offset) >> loop_restoration_.unit_size_log2[plane],
+ restoration_info_->num_vertical_units(static_cast<Plane>(plane)) - 1);
current_process_unit_height = std::min(expected_height, plane_height - y);
expected_height = plane_process_unit_height;
- Array2DView<Pixel> loop_restored_window(
- current_process_unit_height, static_cast<int>(stride),
+ ApplyLoopRestorationForOneRow<Pixel>(
+ reinterpret_cast<Pixel*>(superres_buffer_[plane]), stride,
+ static_cast<Plane>(plane), plane_height, plane_width, y, unit_row,
+ current_process_unit_height, plane_unit_size,
reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
y * stride);
- ApplyLoopRestorationForOneRowInWindow<Pixel>(
- reinterpret_cast<Pixel*>(superres_buffer_[plane]),
- static_cast<Plane>(plane), plane_height, plane_width, y, 0, 0,
- unit_row, current_process_unit_height, plane_unit_size, plane_width,
- &loop_restored_window);
}
- }
-}
-
-// Multi-thread version of loop restoration, based on a moving window of size
-// |window_buffer_width_|x|window_buffer_height_|. Inside the moving window, we
-// create a filtering job for each row and each filtering job is submitted to
-// the thread pool. Each free thread takes one job from the thread pool and
-// completes filtering until all jobs are finished. This approach requires an
-// extra buffer (|threaded_window_buffer_|) to hold the filtering output, whose
-// size is the size of the window. It also needs block buffers (i.e.,
-// |block_buffer| in ApplyLoopRestorationForOneRowInWindow()) to store
-// intermediate results in loop restoration for each thread. After all units
-// inside the window are filtered, the output is written to the frame buffer.
-template <typename Pixel>
-void PostFilter::ApplyLoopRestorationThreaded() {
- const int plane_process_unit_height[kMaxPlanes] = {
- kRestorationUnitHeight, kRestorationUnitHeight >> subsampling_y_[kPlaneU],
- kRestorationUnitHeight >> subsampling_y_[kPlaneV]};
- Array2DView<Pixel> loop_restored_window;
- if (!DoCdef()) {
- loop_restored_window.Reset(
- window_buffer_height_, window_buffer_width_,
- reinterpret_cast<Pixel*>(threaded_window_buffer_));
- }
-
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
- continue;
- }
-
- const int unit_height_offset =
- kRestorationUnitOffset >> subsampling_y_[plane];
- auto* const src_buffer = reinterpret_cast<Pixel*>(superres_buffer_[plane]);
- const ptrdiff_t src_stride = frame_buffer_.stride(plane) / sizeof(Pixel);
- const int plane_unit_size = loop_restoration_.unit_size[plane];
- const int num_vertical_units =
- restoration_info_->num_vertical_units(static_cast<Plane>(plane));
- const int plane_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
- const int plane_height =
- RightShiftWithRounding(height_, subsampling_y_[plane]);
- PostFilter::ExtendFrame<Pixel>(
- src_buffer, plane_width, plane_height, src_stride,
- kRestorationHorizontalBorder, kRestorationHorizontalBorder,
- kRestorationVerticalBorder, kRestorationVerticalBorder);
-
- const int num_workers = thread_pool_->num_threads();
- for (int y = 0; y < plane_height; y += window_buffer_height_) {
- const int actual_window_height =
- std::min(window_buffer_height_ - ((y == 0) ? unit_height_offset : 0),
- plane_height - y);
- int vertical_units_per_window =
- (actual_window_height + plane_process_unit_height[plane] - 1) /
- plane_process_unit_height[plane];
- if (y == 0) {
- // The first row of loop restoration processing units is not 64x64, but
- // 64x56 (|unit_height_offset| = 8 rows less than other restoration
- // processing units). For u/v with subsampling, the size is halved. To
- // compute the number of vertical units per window, we need to take a
- // special handling for it.
- const int height_without_first_unit =
- actual_window_height -
- std::min(actual_window_height,
- plane_process_unit_height[plane] - unit_height_offset);
- vertical_units_per_window =
- (height_without_first_unit + plane_process_unit_height[plane] - 1) /
- plane_process_unit_height[plane] +
- 1;
- }
- const int jobs_for_threadpool =
- vertical_units_per_window * num_workers / (num_workers + 1);
- for (int x = 0; x < plane_width; x += window_buffer_width_) {
- const int actual_window_width =
- std::min(window_buffer_width_, plane_width - x);
- assert(jobs_for_threadpool < vertical_units_per_window);
- if (DoCdef()) {
- loop_restored_window.Reset(
- actual_window_height, static_cast<int>(src_stride),
- reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
- y * src_stride + x);
- }
- BlockingCounter pending_jobs(jobs_for_threadpool);
- int job_count = 0;
- int current_process_unit_height;
- for (int row = 0; row < actual_window_height;
- row += current_process_unit_height) {
- const int unit_y = y + row;
- const int expected_height = plane_process_unit_height[plane] -
- ((unit_y == 0) ? unit_height_offset : 0);
- current_process_unit_height =
- std::min(expected_height, plane_height - unit_y);
- const int unit_row =
- std::min((unit_y + unit_height_offset) / plane_unit_size,
- num_vertical_units - 1);
-
- if (job_count < jobs_for_threadpool) {
- thread_pool_->Schedule(
- [this, src_buffer, plane, plane_height, plane_width, y, x, row,
- unit_row, current_process_unit_height, plane_unit_size,
- actual_window_width, &loop_restored_window, &pending_jobs]() {
- ApplyLoopRestorationForOneRowInWindow<Pixel>(
- src_buffer, static_cast<Plane>(plane), plane_height,
- plane_width, y, x, row, unit_row,
- current_process_unit_height, plane_unit_size,
- actual_window_width, &loop_restored_window);
- pending_jobs.Decrement();
- });
- } else {
- ApplyLoopRestorationForOneRowInWindow<Pixel>(
- src_buffer, static_cast<Plane>(plane), plane_height,
- plane_width, y, x, row, unit_row, current_process_unit_height,
- plane_unit_size, actual_window_width, &loop_restored_window);
- }
- ++job_count;
- }
- // Wait for all jobs of current window to finish.
- pending_jobs.Wait();
- if (!DoCdef()) {
- // Copy |threaded_window_buffer_| to output frame.
- CopyPlane<Pixel>(
- reinterpret_cast<const Pixel*>(threaded_window_buffer_),
- window_buffer_width_, actual_window_width, actual_window_height,
- reinterpret_cast<Pixel*>(loop_restoration_buffer_[plane]) +
- y * src_stride + x,
- src_stride);
- }
- }
- if (y == 0) y -= unit_height_offset;
- }
- }
+ } while (++plane < planes_);
}
void PostFilter::ApplyLoopRestoration(const int row4x4_start, const int sb4x4) {
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
- ApplyLoopRestorationSingleThread<uint16_t>(row4x4_start, sb4x4);
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(row4x4_start, sb4x4);
return;
}
#endif
- ApplyLoopRestorationSingleThread<uint8_t>(row4x4_start, sb4x4);
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(row4x4_start, sb4x4);
}
-void PostFilter::ApplyLoopRestoration() {
- assert(threaded_window_buffer_ != nullptr);
+void PostFilter::ApplyLoopRestorationWorker(std::atomic<int>* row4x4_atomic) {
+ int row4x4;
+ // Loop restoration operates with a lag of 8 rows (4 for chroma with
+ // subsampling), so the worker must also cover the last 8 rows of the last
+ // superblock row. The loop therefore runs for one extra iteration.
+ const int row4x4_end = frame_header_.rows4x4 + kNum4x4InLoopRestorationUnit;
+ while ((row4x4 = row4x4_atomic->fetch_add(kNum4x4InLoopRestorationUnit,
+ std::memory_order_relaxed)) <
+ row4x4_end) {
+ CopyBordersForOneSuperBlockRow(row4x4, kNum4x4InLoopRestorationUnit,
+ /*for_loop_restoration=*/true);
#if LIBGAV1_MAX_BITDEPTH >= 10
- if (bitdepth_ >= 10) {
- ApplyLoopRestorationThreaded<uint16_t>();
- return;
- }
+ if (bitdepth_ >= 10) {
+ ApplyLoopRestorationForOneSuperBlockRow<uint16_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ continue;
+ }
#endif
- ApplyLoopRestorationThreaded<uint8_t>();
+ ApplyLoopRestorationForOneSuperBlockRow<uint8_t>(
+ row4x4, kNum4x4InLoopRestorationUnit);
+ }
}
} // namespace libgav1
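
A worked example of the extra iteration mentioned in the worker's comment, assuming luma with no subsampling, a 16-row4x4 claim step, and libgav1's 8-row restoration unit offset: with rows4x4 = 96 the claims 0, 16, ..., 80 restore pixel rows 0..375 only, and the extra claim at row4x4 = 96 finishes rows 376..383.

    #include <algorithm>
    #include <cassert>

    int main() {
      // rows4x4 = 96 -> 384 luma rows. The first pass covers 56 rows
      // (64 minus the 8-row unit offset), later passes 64 rows each,
      // always lagging 8 rows behind the superblock row being claimed.
      const int rows4x4 = 96, height = 4 * rows4x4;
      int covered = 0;
      for (int row4x4 = 0; row4x4 < rows4x4 + 16; row4x4 += 16) {
        const int y = (row4x4 == 0) ? 0 : 4 * row4x4 - 8;
        if (y >= height) break;
        covered = std::min(y + ((row4x4 == 0) ? 56 : 64), height);
      }
      assert(covered == height);  // only true with the extra iteration
      return 0;
    }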
diff --git a/libgav1/src/post_filter/post_filter.cc b/libgav1/src/post_filter/post_filter.cc
index 6d5ef31..7671f01 100644
--- a/libgav1/src/post_filter/post_filter.cc
+++ b/libgav1/src/post_filter/post_filter.cc
@@ -24,6 +24,8 @@
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/array_2d.h"
+#include "src/utils/blocking_counter.h"
+#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
#include "src/utils/types.h"
@@ -34,11 +36,10 @@
// Import all the constants in the anonymous namespace.
#include "src/post_filter/deblock_thresholds.inc"
-// Row indices of deblocked pixels needed by loop restoration. This is used to
-// populate the |deblock_buffer_| when cdef is on. The first dimension is
-// subsampling_y.
-constexpr int kDeblockedRowsForLoopRestoration[2][4] = {{54, 55, 56, 57},
- {26, 27, 28, 29}};
+// Row indices of the loop restoration border. This is used to populate
+// |loop_restoration_border_| when either cdef is on or multithreading is
+// enabled. The array is indexed by subsampling_y.
+constexpr int kLoopRestorationBorderRows[2] = {54, 26};
} // namespace
@@ -77,15 +78,13 @@
const int height, const ptrdiff_t stride,
const int left, const int right, const int top,
const int bottom) {
- const Pixel* src = frame_start;
- Pixel* dst = frame_start - left;
+ Pixel* src = frame_start;
// Copy to left and right borders.
- for (int y = 0; y < height; ++y) {
- Memset(dst, src[0], left);
- Memset(dst + left + width, src[width - 1], right);
+ int y = height;
+ do {
+ ExtendLine<Pixel>(src, width, left, right);
src += stride;
- dst += stride;
- }
+ } while (--y != 0);
// Copy to bottom borders. For performance we copy |stride| pixels
// (including some padding pixels potentially) in each row, ending at the
// bottom right border pixel. In the diagram the asterisks indicate padding
@@ -98,7 +97,7 @@
// **YYY|YZabcdef|fff
// **YYY|YZabcdef|fff <-- bottom right border pixel
assert(src == frame_start + height * stride);
- dst = const_cast<Pixel*>(src) + width + right - stride;
+ Pixel* dst = src - left;
src = dst - stride;
for (int y = 0; y < bottom; ++y) {
memcpy(dst, src, sizeof(Pixel) * stride);
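
A compilable sketch of the bottom-border copy described by the diagram above: once the last image row has its left/right borders in place, each bottom border row is a single memcpy of |stride| pixels from the row above it, replicating border and row content in one shot.

    #include <cstdint>
    #include <cstring>

    // |frame_start| points at image pixel (0, 0); the left border and the
    // bottom border rows live in the same allocation.
    void ExtendBottom(uint8_t* frame_start, int height, ptrdiff_t stride,
                      int left, int bottom) {
      uint8_t* dst = frame_start + height * stride - left;
      const uint8_t* src = dst - stride;  // fully extended last image row
      for (int y = 0; y < bottom; ++y) {
        std::memcpy(dst, src, stride);
        src += stride;
        dst += stride;
      }
    }

    int main() {
      // 2-pixel borders around a 4x2 image, stride 8.
      uint8_t buf[8 * 6] = {};
      uint8_t* frame_start = buf + 2 * 8 + 2;  // skip top borders + left
      ExtendBottom(frame_start, /*height=*/2, /*stride=*/8, /*left=*/2,
                   /*bottom=*/2);
      return 0;
    }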
@@ -159,34 +158,42 @@
sequence_header.color_config.subsampling_y},
planes_(sequence_header.color_config.is_monochrome ? kMaxPlanesMonochrome
: kMaxPlanes),
- pixel_size_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
- : sizeof(uint16_t))),
+ pixel_size_log2_(static_cast<int>((bitdepth_ == 8) ? sizeof(uint8_t)
+ : sizeof(uint16_t)) -
+ 1),
inner_thresh_(kInnerThresh[frame_header.loop_filter.sharpness]),
outer_thresh_(kOuterThresh[frame_header.loop_filter.sharpness]),
needs_chroma_deblock_(frame_header.loop_filter.level[kPlaneU + 1] != 0 ||
frame_header.loop_filter.level[kPlaneV + 1] != 0),
cdef_index_(frame_scratch_buffer->cdef_index),
inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
- threaded_window_buffer_(
- frame_scratch_buffer->threaded_window_buffer.get()),
restoration_info_(&frame_scratch_buffer->loop_restoration_info),
- superres_line_buffer_(frame_scratch_buffer->superres_line_buffer.get()),
+ superres_coefficients_{
+ frame_scratch_buffer->superres_coefficients[kPlaneTypeY].get(),
+ frame_scratch_buffer
+ ->superres_coefficients
+ [(sequence_header.color_config.is_monochrome ||
+ sequence_header.color_config.subsampling_x == 0)
+ ? kPlaneTypeY
+ : kPlaneTypeUV]
+ .get()},
+ superres_line_buffer_(frame_scratch_buffer->superres_line_buffer),
block_parameters_(frame_scratch_buffer->block_parameters_holder),
frame_buffer_(*frame_buffer),
- deblock_buffer_(frame_scratch_buffer->deblock_buffer),
+ cdef_border_(frame_scratch_buffer->cdef_border),
+ loop_restoration_border_(frame_scratch_buffer->loop_restoration_border),
do_post_filter_mask_(do_post_filter_mask),
thread_pool_(
- frame_scratch_buffer->threading_strategy.post_filter_thread_pool()),
- window_buffer_width_(GetWindowBufferWidth(thread_pool_, frame_header)),
- window_buffer_height_(GetWindowBufferHeight(thread_pool_, frame_header)) {
+ frame_scratch_buffer->threading_strategy.post_filter_thread_pool()) {
const int8_t zero_delta_lf[kFrameLfCount] = {};
ComputeDeblockFilterLevels(zero_delta_lf, deblock_filter_levels_);
if (DoSuperRes()) {
- for (int plane = 0; plane < planes_; ++plane) {
+ int plane = kPlaneY;
+ do {
const int downscaled_width =
- RightShiftWithRounding(width_, subsampling_x_[plane]);
+ SubsampledValue(width_, subsampling_x_[plane]);
const int upscaled_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
const int superres_width = downscaled_width << kSuperResScaleBits;
super_res_info_[plane].step =
(superres_width + upscaled_width / 2) / upscaled_width;
@@ -199,46 +206,58 @@
(1 << (kSuperResExtraBits - 1)) - error / 2) &
kSuperResScaleMask;
super_res_info_[plane].upscaled_width = upscaled_width;
+ } while (++plane < planes_);
+ if (dsp->super_res_coefficients != nullptr) {
+ int plane = kPlaneY;
+ const int number_loops = (superres_coefficients_[kPlaneTypeY] ==
+ superres_coefficients_[kPlaneTypeUV])
+ ? kMaxPlanesMonochrome
+ : static_cast<int>(kNumPlaneTypes);
+ do {
+ dsp->super_res_coefficients(
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]),
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, superres_coefficients_[plane]);
+ } while (++plane < number_loops);
}
}
- for (int plane = 0; plane < planes_; ++plane) {
+ int plane = kPlaneY;
+ do {
loop_restoration_buffer_[plane] = frame_buffer_.data(plane);
cdef_buffer_[plane] = frame_buffer_.data(plane);
superres_buffer_[plane] = frame_buffer_.data(plane);
source_buffer_[plane] = frame_buffer_.data(plane);
- }
- // In single threaded mode, we apply SuperRes without making a copy of the
- // input row by writing the output to one row to the top (we refer to this
- // process as "in place superres" in our code).
- const bool in_place_superres = DoSuperRes() && thread_pool_ == nullptr;
- if (DoCdef() || DoRestoration() || in_place_superres) {
- for (int plane = 0; plane < planes_; ++plane) {
+ } while (++plane < planes_);
+ if (DoCdef() || DoRestoration() || DoSuperRes()) {
+ plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
int horizontal_shift = 0;
int vertical_shift = 0;
if (DoRestoration() &&
loop_restoration_.type[plane] != kLoopRestorationTypeNone) {
horizontal_shift += frame_buffer_.alignment();
- if (!DoCdef()) {
+ if (!DoCdef() && thread_pool_ == nullptr) {
vertical_shift += kRestorationVerticalBorder;
}
superres_buffer_[plane] +=
vertical_shift * frame_buffer_.stride(plane) +
- horizontal_shift * pixel_size_;
+ (horizontal_shift << pixel_size_log2);
}
- if (in_place_superres) {
+ if (DoSuperRes()) {
vertical_shift += kSuperResVerticalBorder;
}
cdef_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
- horizontal_shift * pixel_size_;
- if (DoCdef()) {
+ (horizontal_shift << pixel_size_log2);
+ if (DoCdef() && thread_pool_ == nullptr) {
horizontal_shift += frame_buffer_.alignment();
vertical_shift += kCdefBorder;
}
assert(horizontal_shift <= frame_buffer_.right_border(plane));
assert(vertical_shift <= frame_buffer_.bottom_border(plane));
source_buffer_[plane] += vertical_shift * frame_buffer_.stride(plane) +
- horizontal_shift * pixel_size_;
- }
+ (horizontal_shift << pixel_size_log2);
+ } while (++plane < planes_);
}
}
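
A worked example of the SuperRes step computed in the constructor above, assuming kSuperResScaleBits = 14 (libgav1's value; treated as an assumption here): the step is the per-output-pixel source advance in Q14 fixed point, rounded to nearest.

    #include <cassert>
    #include <cstdio>

    int main() {
      const int kSuperResScaleBits = 14;  // assumption: libgav1's constant
      const int downscaled_width = 1280, upscaled_width = 1920;
      const int superres_width = downscaled_width << kSuperResScaleBits;
      // Rounded Q14 ratio: (1280 / 1920) * 2^14 = 10922.67 -> 10923.
      const int step = (superres_width + upscaled_width / 2) / upscaled_width;
      assert(step == 10923);
      std::printf("advance %.4f src pixels per dst pixel\n", step / 16384.0);
      return 0;
    }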
@@ -261,11 +280,11 @@
void PostFilter::ExtendBordersForReferenceFrame() {
if (frame_header_.refresh_frame_flags == 0) return;
- for (int plane = kPlaneY; plane < planes_; ++plane) {
+ int plane = kPlaneY;
+ do {
const int plane_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
- const int plane_height =
- RightShiftWithRounding(height_, subsampling_y_[plane]);
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
assert(frame_buffer_.left_border(plane) >= kMinLeftBorderPixels &&
frame_buffer_.right_border(plane) >= kMinRightBorderPixels &&
frame_buffer_.top_border(plane) >= kMinTopBorderPixels &&
@@ -283,45 +302,31 @@
frame_buffer_.stride(plane), frame_buffer_.left_border(plane),
frame_buffer_.right_border(plane), frame_buffer_.top_border(plane),
frame_buffer_.bottom_border(plane));
- }
+ } while (++plane < planes_);
}
void PostFilter::CopyDeblockedPixels(Plane plane, int row4x4) {
const ptrdiff_t src_stride = frame_buffer_.stride(plane);
- const uint8_t* const src =
- GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0);
- const ptrdiff_t dst_stride = deblock_buffer_.stride(plane);
+ const uint8_t* const src = GetSourceBuffer(plane, row4x4, 0);
const int row_offset = DivideBy4(row4x4);
- uint8_t* dst = deblock_buffer_.data(plane) + dst_stride * row_offset;
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst = loop_restoration_border_.data(plane) + row_offset * dst_stride;
const int num_pixels = SubsampledValue(MultiplyBy4(frame_header_.columns4x4),
subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
int last_valid_row = -1;
const int plane_height =
SubsampledValue(frame_header_.height, subsampling_y_[plane]);
- for (int i = 0; i < 4; ++i) {
- int row = kDeblockedRowsForLoopRestoration[subsampling_y_[plane]][i];
- const int absolute_row =
- (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
- if (absolute_row >= plane_height) {
- if (last_valid_row == -1) {
- // We have run out of rows and there is no valid row to copy. This will not
- // be used by loop restoration, so we can simply break here. However,
- // MSAN does not know that this is never used (since we sometimes apply
- // superres to this row as well). So zero it out in case of MSAN.
-#if LIBGAV1_MSAN
- if (DoSuperRes()) {
- memset(dst, 0, num_pixels * pixel_size_);
- dst += dst_stride;
- continue;
- }
-#endif
- break;
- }
+ int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row = (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ for (int i = 0; i < 4; ++i, ++row) {
+ if (absolute_row + i >= plane_height) {
+ if (last_valid_row == -1) break;
// If we run out of rows, copy the last valid row (mimics the bottom
// border extension).
row = last_valid_row;
}
- memcpy(dst, src + src_stride * row, num_pixels * pixel_size_);
+ memcpy(dst, src + row * src_stride, row_width);
last_valid_row = row;
dst += dst_stride;
}
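
A sketch of the clamping loop above, assuming |src| points at the plane origin (the diff's version offsets within the current superblock): requested rows past the plane re-copy the most recent in-bounds row, mimicking bottom border extension.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // If no row was ever valid, there is nothing to replicate and the
    // loop exits early; otherwise out-of-range rows repeat the last one.
    void CopyFourRowsClamped(const uint8_t* src, ptrdiff_t src_stride,
                             uint8_t* dst, ptrdiff_t dst_stride,
                             int row_width, int first_row, int plane_height) {
      int last_valid_row = -1;
      int row = first_row;
      for (int i = 0; i < 4; ++i, ++row) {
        if (first_row + i >= plane_height) {
          if (last_valid_row == -1) break;
          row = last_valid_row;
        }
        std::memcpy(dst, src + row * src_stride, row_width);
        last_valid_row = row;
        dst += dst_stride;
      }
    }

    int main() {
      uint8_t plane[8 * 4] = {}, out[8 * 4] = {};
      // Rows 2..5 of a 4-row plane: rows 4 and 5 replicate row 3.
      CopyFourRowsClamped(plane, 8, out, 8, /*row_width=*/8, /*first_row=*/2,
                          /*plane_height=*/4);
      return 0;
    }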
@@ -334,20 +339,21 @@
const int row_offset = (row4x4 == 0) ? 0 : 8;
// Number of rows to be subtracted from the height described by sb4x4.
const int height_offset = (row4x4 == 0) ? 8 : 0;
- // If cdef is off, then loop restoration needs 2 extra rows for the bottom
- // border in each plane.
- const int extra_rows = (for_loop_restoration && !DoCdef()) ? 2 : 0;
- for (int plane = 0; plane < planes_; ++plane) {
+ // If cdef is off and post filter multithreading is off, then loop restoration
+ // needs 2 extra rows for the bottom border in each plane.
+ const int extra_rows =
+ (for_loop_restoration && thread_pool_ == nullptr && !DoCdef()) ? 2 : 0;
+ int plane = kPlaneY;
+ do {
const int plane_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
- const int plane_height =
- RightShiftWithRounding(height_, subsampling_y_[plane]);
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
const int row = (MultiplyBy4(row4x4) - row_offset) >> subsampling_y_[plane];
assert(row >= 0);
if (row >= plane_height) break;
const int num_rows =
- std::min(RightShiftWithRounding(MultiplyBy4(sb4x4) - height_offset,
- subsampling_y_[plane]) +
+ std::min(SubsampledValue(MultiplyBy4(sb4x4) - height_offset,
+ subsampling_y_[plane]) +
extra_rows,
plane_height - row);
// We only need to track the progress of the Y plane since the progress of
@@ -377,20 +383,182 @@
: 0;
ExtendFrameBoundary(start, plane_width, num_rows, stride, left_border,
right_border, top_border, bottom_border);
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(const int row4x4) {
+ assert(row4x4 >= 0);
+ assert(!DoCdef());
+ assert(DoRestoration());
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ const int row_offset = DivideBy4(row4x4);
+ const int num_pixels =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ const int row_width = num_pixels << pixel_size_log2_;
+ const int plane_height = SubsampledValue(height_, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ const ptrdiff_t src_stride = frame_buffer_.stride(plane);
+ const uint8_t* src =
+ GetSuperResBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * src_stride;
+ const ptrdiff_t dst_stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst =
+ loop_restoration_border_.data(plane) + row_offset * dst_stride;
+ for (int i = 0; i < 4; ++i) {
+ memcpy(dst, src, row_width);
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ ExtendLine<uint8_t>(dst, num_pixels, kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ if (absolute_row + i < plane_height - 1) src += src_stride;
+ dst += dst_stride;
+ }
+ } while (++plane < planes_);
+}
+
+void PostFilter::SetupLoopRestorationBorder(int row4x4_start, int sb4x4) {
+ assert(row4x4_start >= 0);
+ assert(DoCdef());
+ assert(DoRestoration());
+ for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
+ const int row4x4 = row4x4_start + sb_y;
+ const int row_offset_start = DivideBy4(row4x4);
+ const std::array<uint8_t*, kMaxPlanes> dst = {
+ loop_restoration_border_.data(kPlaneY) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneY),
+ loop_restoration_border_.data(kPlaneU) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneU),
+ loop_restoration_border_.data(kPlaneV) +
+ row_offset_start * loop_restoration_border_.stride(kPlaneV)};
+ // If SuperRes is enabled, then we apply SuperRes for the rows to be copied
+ // directly with |loop_restoration_border_| as the destination. Otherwise,
+ // we simply copy the rows.
+ if (DoSuperRes()) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<int, kMaxPlanes> rows;
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ rows[plane] = 0;
+ continue;
+ }
+ const int plane_height =
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
+ const int row = kLoopRestorationBorderRows[subsampling_y_[plane]];
+ const int absolute_row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + row;
+ src[plane] = GetSourceBuffer(static_cast<Plane>(plane), row4x4, 0) +
+ row * frame_buffer_.stride(plane);
+ rows[plane] = Clip3(plane_height - absolute_row, 0, 4);
+ } while (++plane < planes_);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst,
+ /*dst_is_loop_restoration_border=*/true);
+ // If we run out of rows, copy the last valid row (mimics the bottom
+ // border extension).
+ plane = kPlaneY;
+ do {
+ if (rows[plane] == 0 || rows[plane] >= 4) continue;
+ const ptrdiff_t stride = loop_restoration_border_.stride(plane);
+ uint8_t* dst_line = dst[plane] + rows[plane] * stride;
+ const uint8_t* const src_line = dst_line - stride;
+ const int upscaled_width = super_res_info_[plane].upscaled_width
+ << pixel_size_log2_;
+ for (int i = rows[plane]; i < 4; ++i) {
+ memcpy(dst_line, src_line, upscaled_width);
+ dst_line += stride;
+ }
+ } while (++plane < planes_);
+ } else {
+ int plane = kPlaneY;
+ do {
+ CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
+ } while (++plane < planes_);
+ }
+ // Extend the left and right boundaries needed for loop restoration.
+ int plane = kPlaneY;
+ do {
+ if (loop_restoration_.type[plane] == kLoopRestorationTypeNone) {
+ continue;
+ }
+ uint8_t* dst_line = dst[plane];
+ const int plane_width =
+ SubsampledValue(upscaled_width_, subsampling_x_[plane]);
+ for (int i = 0; i < 4; ++i) {
+#if LIBGAV1_MAX_BITDEPTH >= 10
+ if (bitdepth_ >= 10) {
+ ExtendLine<uint16_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ } else // NOLINT.
+#endif
+ {
+ ExtendLine<uint8_t>(dst_line, plane_width,
+ kRestorationHorizontalBorder,
+ kRestorationHorizontalBorder);
+ }
+ dst_line += loop_restoration_border_.stride(plane);
+ }
+ } while (++plane < planes_);
}
}
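
SetupLoopRestorationBorder leans on the shared ExtendLine<Pixel> helper (the local copy in super_res.cc is deleted further down in this diff). A minimal sketch of its behavior, matching the deleted definition:

    #include <algorithm>
    #include <cstdint>

    // Replicates a row's edge pixels into its left and right borders. The
    // byte pointer is reinterpreted per bitdepth, as in the diff's calls.
    template <typename Pixel>
    void ExtendLineSketch(uint8_t* line_start, int width, int left,
                          int right) {
      auto* const start = reinterpret_cast<Pixel*>(line_start);
      std::fill(start - left, start, start[0]);
      std::fill(start + width, start + width + right, start[width - 1]);
    }

    int main() {
      uint8_t row[12] = {0, 0, 7, 8, 9, 10, 0, 0, 0, 0, 0, 0};
      // 4 pixels starting at index 2, with 2-pixel borders on each side.
      ExtendLineSketch<uint8_t>(row + 2, /*width=*/4, /*left=*/2,
                                /*right=*/2);
      // row is now {7, 7, 7, 8, 9, 10, 10, 10, ...}.
      return 0;
    }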
+void PostFilter::RunJobs(WorkerFunction worker) {
+ std::atomic<int> row4x4(0);
+ const int num_workers = thread_pool_->num_threads();
+ BlockingCounter pending_workers(num_workers);
+ for (int i = 0; i < num_workers; ++i) {
+ thread_pool_->Schedule([this, &row4x4, &pending_workers, worker]() {
+ (this->*worker)(&row4x4);
+ pending_workers.Decrement();
+ });
+ }
+ // Run the jobs on the current thread.
+ (this->*worker)(&row4x4);
+ // Wait for the threadpool jobs to finish.
+ pending_workers.Wait();
+}
+
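
RunJobs above is the generic dispatch now shared by the deblock, cdef, and loop restoration passes: schedule the worker on every pool thread, run one copy on the calling thread as well, and block on a counter until all copies return. A self-contained sketch using std::thread in place of the pool and a simplified stand-in for libgav1's BlockingCounter:

    #include <atomic>
    #include <condition_variable>
    #include <functional>
    #include <mutex>
    #include <thread>
    #include <vector>

    // Wait() returns once Decrement() has run the expected number of times.
    class BlockingCounterSketch {
     public:
      explicit BlockingCounterSketch(int count) : count_(count) {}
      void Decrement() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (--count_ == 0) cv_.notify_all();
      }
      void Wait() {
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this] { return count_ == 0; });
      }

     private:
      std::mutex mutex_;
      std::condition_variable cv_;
      int count_;
    };

    void RunJobsSketch(const std::function<void(std::atomic<int>*)>& worker,
                       int num_pool_threads) {
      std::atomic<int> row4x4(0);
      BlockingCounterSketch pending(num_pool_threads);
      std::vector<std::thread> pool;
      for (int i = 0; i < num_pool_threads; ++i) {
        pool.emplace_back([&worker, &row4x4, &pending]() {
          worker(&row4x4);
          pending.Decrement();
        });
      }
      worker(&row4x4);  // the current thread participates as well
      pending.Wait();
      for (auto& t : pool) t.join();
    }

    int main() {
      RunJobsSketch([](std::atomic<int>* row) { row->fetch_add(1); }, 3);
      return 0;
    }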
void PostFilter::ApplyFilteringThreaded() {
- if (DoDeblock()) ApplyDeblockFilterThreaded();
+ if (DoDeblock()) {
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeVertical>);
+ RunJobs(&PostFilter::DeblockFilterWorker<kLoopFilterTypeHorizontal>);
+ }
if (DoCdef() && DoRestoration()) {
for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
row4x4 += kNum4x4InLoopFilterUnit) {
- SetupDeblockBuffer(row4x4, kNum4x4InLoopFilterUnit);
+ SetupLoopRestorationBorder(row4x4, kNum4x4InLoopFilterUnit);
}
}
- if (DoCdef()) ApplyCdef();
+ if (DoCdef()) {
+ for (int row4x4 = 0; row4x4 < frame_header_.rows4x4;
+ row4x4 += kNum4x4InLoopFilterUnit) {
+ SetupCdefBorder(row4x4);
+ }
+ RunJobs(&PostFilter::ApplyCdefWorker);
+ }
if (DoSuperRes()) ApplySuperResThreaded();
- if (DoRestoration()) ApplyLoopRestoration();
+ if (DoRestoration()) {
+ if (!DoCdef()) {
+ int row4x4 = 0;
+ do {
+ SetupLoopRestorationBorder(row4x4);
+ row4x4 += kNum4x4InLoopFilterUnit;
+ } while (row4x4 < frame_header_.rows4x4);
+ }
+ RunJobs(&PostFilter::ApplyLoopRestorationWorker);
+ }
ExtendBordersForReferenceFrame();
}
@@ -402,7 +570,7 @@
ApplyDeblockFilterForOneSuperBlockRow(row4x4, sb4x4);
}
if (DoRestoration() && DoCdef()) {
- SetupDeblockBuffer(row4x4, sb4x4);
+ SetupLoopRestorationBorder(row4x4, sb4x4);
}
if (DoCdef()) {
ApplyCdefForOneSuperBlockRow(row4x4, sb4x4, is_last_row);
diff --git a/libgav1/src/post_filter/super_res.cc b/libgav1/src/post_filter/super_res.cc
index f6594f4..554e537 100644
--- a/libgav1/src/post_filter/super_res.cc
+++ b/libgav1/src/post_filter/super_res.cc
@@ -15,218 +15,197 @@
#include "src/utils/blocking_counter.h"
namespace libgav1 {
-namespace {
-template <typename Pixel>
-void ExtendLine(uint8_t* const line_start, const int width, const int left,
- const int right) {
- auto* const start = reinterpret_cast<Pixel*>(line_start);
- const Pixel* src = start;
- Pixel* dst = start - left;
- // Copy to left and right borders.
- Memset(dst, src[0], left);
- Memset(dst + (left + width), src[width - 1], right);
-}
-
-} // namespace
-
-template <bool in_place>
-void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& buffers,
- const std::array<int, kMaxPlanes>& strides,
+void PostFilter::ApplySuperRes(const std::array<uint8_t*, kMaxPlanes>& src,
const std::array<int, kMaxPlanes>& rows,
- size_t line_buffer_offset) {
- // Only used when |in_place| == false.
- uint8_t* const line_buffer_start = superres_line_buffer_ +
- line_buffer_offset +
- kSuperResHorizontalBorder * pixel_size_;
- for (int plane = kPlaneY; plane < planes_; ++plane) {
- const int8_t subsampling_x = subsampling_x_[plane];
+ const int line_buffer_row,
+ const std::array<uint8_t*, kMaxPlanes>& dst,
+ bool dst_is_loop_restoration_border /*=false*/) {
+ int plane = kPlaneY;
+ do {
const int plane_width =
- MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
- uint8_t* input = buffers[plane];
- const uint32_t input_stride = strides[plane];
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
#if LIBGAV1_MAX_BITDEPTH >= 10
if (bitdepth_ >= 10) {
- for (int y = 0; y < rows[plane]; ++y, input += input_stride) {
- if (!in_place) {
- memcpy(line_buffer_start, input, plane_width * sizeof(uint16_t));
- }
- ExtendLine<uint16_t>(in_place ? input : line_buffer_start, plane_width,
- kSuperResHorizontalBorder,
- kSuperResHorizontalBorder);
- dsp_.super_res_row(in_place ? input : line_buffer_start,
- super_res_info_[plane].upscaled_width,
- super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step,
- input - (in_place ? input_stride : 0));
+ auto* input = reinterpret_cast<uint16_t*>(src[plane]);
+ auto* output = reinterpret_cast<uint16_t*>(dst[plane]);
+ const ptrdiff_t input_stride =
+ frame_buffer_.stride(plane) / sizeof(uint16_t);
+ const ptrdiff_t output_stride =
+ (dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane)) /
+ sizeof(uint16_t);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, input_stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output, output_stride);
+ }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ auto* const line_buffer_start =
+ reinterpret_cast<uint16_t*>(superres_line_buffer_.data(plane)) +
+ line_buffer_row * superres_line_buffer_.stride(plane) /
+ sizeof(uint16_t) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step,
+ output + rows[plane] * output_stride, /*dest_stride=*/0);
}
continue;
}
#endif // LIBGAV1_MAX_BITDEPTH >= 10
- for (int y = 0; y < rows[plane]; ++y, input += input_stride) {
- if (!in_place) {
- memcpy(line_buffer_start, input, plane_width);
- }
- ExtendLine<uint8_t>(in_place ? input : line_buffer_start, plane_width,
- kSuperResHorizontalBorder, kSuperResHorizontalBorder);
- dsp_.super_res_row(in_place ? input : line_buffer_start,
- super_res_info_[plane].upscaled_width,
- super_res_info_[plane].initial_subpixel_x,
- super_res_info_[plane].step,
- input - (in_place ? input_stride : 0));
+ uint8_t* input = src[plane];
+ uint8_t* output = dst[plane];
+ const ptrdiff_t input_stride = frame_buffer_.stride(plane);
+ const ptrdiff_t output_stride = dst_is_loop_restoration_border
+ ? loop_restoration_border_.stride(plane)
+ : frame_buffer_.stride(plane);
+ if (rows[plane] > 0) {
+ dsp_.super_res(superres_coefficients_[static_cast<int>(plane != 0)],
+ input, input_stride, rows[plane], plane_width,
+ super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output, output_stride);
}
- }
+ // In the multi-threaded case, the |superres_line_buffer_| holds the last
+ // input row. Apply SuperRes for that row.
+ if (line_buffer_row >= 0) {
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ kSuperResHorizontalBorder;
+ dsp_.super_res(
+ superres_coefficients_[static_cast<int>(plane != 0)],
+ line_buffer_start, /*source_stride=*/0,
+ /*height=*/1, plane_width, super_res_info_[plane].upscaled_width,
+ super_res_info_[plane].initial_subpixel_x,
+ super_res_info_[plane].step, output + rows[plane] * output_stride,
+ /*dest_stride=*/0);
+ }
+ } while (++plane < planes_);
}
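
The batched dsp_.super_res call replaces the old per-row dsp_.super_res_row loop. A simplified scalar sketch of the horizontal subpixel walk it performs (nearest-neighbor in place of the real interpolation filter, and initial_subpixel_x = 0 is a simplification):

    #include <cstdint>

    // |subpixel_x| walks the source in Q14 steps; the real kernel also
    // uses the fractional phase to select interpolation filter taps.
    void SuperResRowSketch(const uint8_t* src, int upscaled_width,
                           int initial_subpixel_x, int step, uint8_t* dst) {
      const int kScaleBits = 14;  // assumption: libgav1's kSuperResScaleBits
      int subpixel_x = initial_subpixel_x;
      for (int x = 0; x < upscaled_width; ++x) {
        dst[x] = src[subpixel_x >> kScaleBits];
        subpixel_x += step;
      }
    }

    int main() {
      const uint8_t src[4] = {10, 20, 30, 40};
      uint8_t dst[6];
      SuperResRowSketch(src, 6, /*initial_subpixel_x=*/0, /*step=*/10923,
                        dst);
      return 0;
    }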
-// Used by post_filter_test.cc.
-template void PostFilter::ApplySuperRes<false>(
- const std::array<uint8_t*, kMaxPlanes>& buffers,
- const std::array<int, kMaxPlanes>& strides,
- const std::array<int, kMaxPlanes>& rows, size_t line_buffer_offset);
-
void PostFilter::ApplySuperResForOneSuperBlockRow(int row4x4_start, int sb4x4,
bool is_last_row) {
assert(row4x4_start >= 0);
assert(DoSuperRes());
// If not doing cdef, then LR needs two rows of border with superres applied.
const int num_rows_extra = (DoCdef() || !DoRestoration()) ? 0 : 2;
- std::array<uint8_t*, kMaxPlanes> buffers;
- std::array<int, kMaxPlanes> strides;
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
std::array<int, kMaxPlanes> rows;
- // Apply superres for the last 8-num_rows_extra rows of the previous
- // superblock.
- if (row4x4_start > 0) {
- const int row4x4 = row4x4_start - 2;
- for (int plane = 0; plane < planes_; ++plane) {
- const int row =
- (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
- const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
- buffers[plane] = cdef_buffer_[plane] + row_offset;
- strides[plane] = frame_buffer_.stride(plane);
- // Note that the |num_rows_extra| subtraction is done after the value is
- // subsampled since we always need to work on |num_rows_extra| extra rows
- // irrespective of the plane subsampling.
- rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
- }
- ApplySuperRes<true>(buffers, strides, rows, /*line_buffer_offset=*/0);
- }
- // Apply superres for the current superblock row (except for the last
- // 8-num_rows_extra rows).
const int num_rows4x4 =
std::min(sb4x4, frame_header_.rows4x4 - row4x4_start) -
(is_last_row ? 0 : 2);
- for (int plane = 0; plane < planes_; ++plane) {
- const ptrdiff_t row_offset =
- (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
- frame_buffer_.stride(plane);
- buffers[plane] = cdef_buffer_[plane] + row_offset;
- strides[plane] = frame_buffer_.stride(plane);
- // Note that the |num_rows_extra| subtraction is done after the value is
- // subsampled since we always need to work on |num_rows_extra| extra rows
- // irrespective of the plane subsampling.
- rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
- (is_last_row ? 0 : num_rows_extra);
+ if (row4x4_start > 0) {
+ const int row4x4 = row4x4_start - 2;
+ int plane = kPlaneY;
+ do {
+ const int row =
+ (MultiplyBy4(row4x4) >> subsampling_y_[plane]) + num_rows_extra;
+ const ptrdiff_t row_offset = row * frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| subtraction is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ // Apply superres for the last 8-|num_rows_extra| rows of the previous
+ // superblock.
+ rows[plane] = (8 >> subsampling_y_[plane]) - num_rows_extra;
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ rows[plane] += (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
+ } else {
+ // Apply superres for the current superblock row (except for the last
+ // 8-|num_rows_extra| rows).
+ int plane = kPlaneY;
+ do {
+ const ptrdiff_t row_offset =
+ (MultiplyBy4(row4x4_start) >> subsampling_y_[plane]) *
+ frame_buffer_.stride(plane);
+ src[plane] = cdef_buffer_[plane] + row_offset;
+ dst[plane] = superres_buffer_[plane] + row_offset;
+ // Note that the |num_rows_extra| addition is done after the value is
+ // subsampled since we always need to work on |num_rows_extra| extra rows
+ // irrespective of the plane subsampling.
+ rows[plane] = (MultiplyBy4(num_rows4x4) >> subsampling_y_[plane]) +
+ (is_last_row ? 0 : num_rows_extra);
+ } while (++plane < planes_);
}
- ApplySuperRes<true>(buffers, strides, rows, /*line_buffer_offset=*/0);
+ ApplySuperRes(src, rows, /*line_buffer_row=*/-1, dst);
}
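As a worked example of the row arithmetic above (illustrative values): with 4:2:0 chroma (subsampling_y_ = 1 for U/V), loop restoration without cdef (num_rows_extra = 2), sb4x4 = 16, and a superblock row that is neither first nor last, num_rows4x4 = 16 - 2 = 14. The luma plane then gets rows[kPlaneY] = (8 >> 0) - 2 + (4 * 14 >> 0) + 2 = 64 rows, and each chroma plane gets (8 >> 1) - 2 + (4 * 14 >> 1) + 2 = 32 rows: one full 64-high luma superblock plus its 32-high chroma counterpart, offset upward by the deferred rows of the previous superblock.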
void PostFilter::ApplySuperResThreaded() {
- const int num_threads = thread_pool_->num_threads() + 1;
- // The number of rows4x4 that will be processed by each thread in the thread
- // pool (other than the current thread).
- const int thread_pool_rows4x4 = frame_header_.rows4x4 / num_threads;
- // For the current thread, we round up to process all the remaining rows so
- // that the current thread's job will potentially run the longest.
- const int current_thread_rows4x4 =
- frame_header_.rows4x4 - (thread_pool_rows4x4 * (num_threads - 1));
- // The size of the line buffer required by each thread. In the multi-threaded
- // case we are guaranteed to have a line buffer which can store |num_threads|
- // rows at the same time.
- const size_t line_buffer_size =
- (MultiplyBy4(frame_header_.columns4x4) +
- MultiplyBy2(kSuperResHorizontalBorder) + kSuperResHorizontalPadding) *
- pixel_size_;
- size_t line_buffer_offset = 0;
+ int num_threads = thread_pool_->num_threads() + 1;
+ // The number of rows that will be processed by each thread in the thread pool
+ // (other than the current thread).
+ int thread_pool_rows = height_ / num_threads;
+ thread_pool_rows = std::max(thread_pool_rows, 1);
+  // Make the number of Y plane rows even when the other planes are subsampled.
+ if ((thread_pool_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++thread_pool_rows;
+ }
+ // Adjust the number of threads to what we really need.
+ num_threads = Clip3(height_ / thread_pool_rows, 1, num_threads);
+ // For the current thread, we round up to process all the remaining rows.
+ int current_thread_rows = height_ - thread_pool_rows * (num_threads - 1);
+  // Make the number of Y plane rows even when the other planes are subsampled.
+ if ((current_thread_rows & 1) != 0 && subsampling_y_[kPlaneU] != 0) {
+ ++current_thread_rows;
+ }
+ assert(current_thread_rows > 0);
BlockingCounter pending_workers(num_threads - 1);
- for (int i = 0, row4x4_start = 0; i < num_threads; ++i,
- row4x4_start += thread_pool_rows4x4,
- line_buffer_offset += line_buffer_size) {
- std::array<uint8_t*, kMaxPlanes> buffers;
- std::array<int, kMaxPlanes> strides;
+ for (int line_buffer_row = 0, row_start = 0; line_buffer_row < num_threads;
+ ++line_buffer_row, row_start += thread_pool_rows) {
+ std::array<uint8_t*, kMaxPlanes> src;
+ std::array<uint8_t*, kMaxPlanes> dst;
std::array<int, kMaxPlanes> rows;
- for (int plane = 0; plane < planes_; ++plane) {
- strides[plane] = frame_buffer_.stride(plane);
- buffers[plane] =
- GetBufferOffset(cdef_buffer_[plane], strides[plane],
- static_cast<Plane>(plane), row4x4_start, 0);
- if (i < num_threads - 1) {
- rows[plane] = MultiplyBy4(thread_pool_rows4x4) >> subsampling_y_[plane];
- } else {
- rows[plane] =
- MultiplyBy4(current_thread_rows4x4) >> subsampling_y_[plane];
- }
- }
- if (i < num_threads - 1) {
- thread_pool_->Schedule([this, buffers, strides, rows, line_buffer_offset,
- &pending_workers]() {
- ApplySuperRes<false>(buffers, strides, rows, line_buffer_offset);
- pending_workers.Decrement();
- });
+ int plane = kPlaneY;
+ const int pixel_size_log2 = pixel_size_log2_;
+ do {
+ src[plane] =
+ GetBufferOffset(cdef_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ dst[plane] =
+ GetBufferOffset(superres_buffer_[plane], frame_buffer_.stride(plane),
+ static_cast<Plane>(plane), row_start, 0);
+ rows[plane] =
+ (((line_buffer_row < num_threads - 1) ? thread_pool_rows
+ : current_thread_rows) >>
+ subsampling_y_[plane]) -
+ 1;
+ const int plane_width =
+ MultiplyBy4(frame_header_.columns4x4) >> subsampling_x_[plane];
+ uint8_t* const input =
+ src[plane] + rows[plane] * frame_buffer_.stride(plane);
+ uint8_t* const line_buffer_start =
+ superres_line_buffer_.data(plane) +
+ line_buffer_row * superres_line_buffer_.stride(plane) +
+ (kSuperResHorizontalBorder << pixel_size_log2);
+ memcpy(line_buffer_start, input, plane_width << pixel_size_log2);
+ } while (++plane < planes_);
+ if (line_buffer_row < num_threads - 1) {
+ thread_pool_->Schedule(
+ [this, src, rows, line_buffer_row, dst, &pending_workers]() {
+ ApplySuperRes(src, rows, line_buffer_row, dst);
+ pending_workers.Decrement();
+ });
} else {
- ApplySuperRes<false>(buffers, strides, rows, line_buffer_offset);
+ ApplySuperRes(src, rows, line_buffer_row, dst);
}
}
// Wait for the threadpool jobs to finish.
pending_workers.Wait();
}
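The per-slice memcpy above is what makes the hand-off to ApplySuperRes safe: each slice's last source row is snapshotted into |superres_line_buffer_| before the jobs are scheduled, each job upscales rows[plane] = slice_rows - 1 rows from the frame, and the final row is finished from the snapshot via the |line_buffer_row| path, so it stays intact even though the upscaling writes into the same frame buffer. As a worked example of the partitioning (illustrative values): with height_ = 250, a 7-worker pool (num_threads = 8) and 4:2:0 chroma, thread_pool_rows = 250 / 8 = 31, bumped to 32 to keep the luma slice even; num_threads = Clip3(250 / 32, 1, 8) = 7; and current_thread_rows = 250 - 32 * 6 = 58, already even, so six pool jobs take 32 rows each and the current thread takes the remaining 58.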
-// This function lives in this file so that it has access to ExtendLine<>.
-void PostFilter::SetupDeblockBuffer(int row4x4_start, int sb4x4) {
- assert(row4x4_start >= 0);
- assert(DoCdef());
- assert(DoRestoration());
- for (int sb_y = 0; sb_y < sb4x4; sb_y += 16) {
- const int row4x4 = row4x4_start + sb_y;
- for (int plane = 0; plane < planes_; ++plane) {
- CopyDeblockedPixels(static_cast<Plane>(plane), row4x4);
- }
- const int row_offset_start = DivideBy4(row4x4);
- if (DoSuperRes()) {
- std::array<uint8_t*, kMaxPlanes> buffers = {
- deblock_buffer_.data(kPlaneY) +
- row_offset_start * deblock_buffer_.stride(kPlaneY),
- deblock_buffer_.data(kPlaneU) +
- row_offset_start * deblock_buffer_.stride(kPlaneU),
- deblock_buffer_.data(kPlaneV) +
- row_offset_start * deblock_buffer_.stride(kPlaneV)};
- std::array<int, kMaxPlanes> strides = {deblock_buffer_.stride(kPlaneY),
- deblock_buffer_.stride(kPlaneU),
- deblock_buffer_.stride(kPlaneV)};
- std::array<int, kMaxPlanes> rows = {4, 4, 4};
- ApplySuperRes<false>(buffers, strides, rows,
- /*line_buffer_offset=*/0);
- }
- // Extend the left and right boundaries needed for loop restoration.
- for (int plane = 0; plane < planes_; ++plane) {
- uint8_t* src = deblock_buffer_.data(plane) +
- row_offset_start * deblock_buffer_.stride(plane);
- const int plane_width =
- RightShiftWithRounding(upscaled_width_, subsampling_x_[plane]);
- for (int i = 0; i < 4; ++i) {
-#if LIBGAV1_MAX_BITDEPTH >= 10
- if (bitdepth_ >= 10) {
- ExtendLine<uint16_t>(src, plane_width, kRestorationHorizontalBorder,
- kRestorationHorizontalBorder);
- } else // NOLINT.
-#endif
- {
- ExtendLine<uint8_t>(src, plane_width, kRestorationHorizontalBorder,
- kRestorationHorizontalBorder);
- }
- src += deblock_buffer_.stride(plane);
- }
- }
- }
-}
-
} // namespace libgav1
diff --git a/libgav1/src/quantizer.cc b/libgav1/src/quantizer.cc
index b26024d..cd720d6 100644
--- a/libgav1/src/quantizer.cc
+++ b/libgav1/src/quantizer.cc
@@ -18,6 +18,7 @@
#include <cstdint>
#include "src/utils/common.h"
+#include "src/utils/constants.h"
#if LIBGAV1_MAX_BITDEPTH != 8 && LIBGAV1_MAX_BITDEPTH != 10
#error LIBGAV1_MAX_BITDEPTH must be 8 or 10
@@ -26,6 +27,9 @@
namespace libgav1 {
namespace {
+// Import all the constants into the anonymous namespace.
+#include "src/quantizer_tables.inc"
+
// Format the kDcLookup and kAcLookup arrays manually for easier comparison
// with the Dc_Qlookup and Ac_Qlookup arrays in Section 7.12.2.
@@ -141,8 +145,99 @@
};
// clang-format on
+void Transpose(uint8_t* const dst, const uint8_t* const src, int src_width,
+ int src_height) {
+ const int dst_width = src_height;
+ const int dst_height = src_width;
+ Array2DView<const uint8_t> source(src_height, src_width, src);
+ Array2DView<uint8_t> dest(dst_height, dst_width, dst);
+ for (int y = 0; y < dst_height; ++y) {
+ for (int x = 0; x < dst_width; ++x) {
+ dest[y][x] = source[x][y];
+ }
+ }
+}
+
+// Copies |src| (the packed lower triangle) into the lower triangle of |dst|
+// and mirrors each value into the upper triangle.
+void FillUpperTriangle(uint8_t* dst, const uint8_t* src, int size) {
+ Array2DView<uint8_t> dest(size, size, dst);
+ int k = 0;
+ for (int y = 0; y < size; ++y) {
+ for (int x = 0; x <= y; ++x) {
+ dest[y][x] = dest[x][y] = src[k++];
+ }
+ }
+}
+
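As a small worked example (illustrative values): for size = 3, src holds the 3 * 4 / 2 = 6 packed lower-triangle entries {a, b, c, d, e, f}, and the loop expands them to

  | a b d |
  | b c e |
  | d e f |

writing each off-diagonal value to both mirrored positions in a single pass.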
} // namespace
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix_ptr) {
+ for (int level = 0; level < kNumQuantizerLevelsForQuantizerMatrix; ++level) {
+ for (int plane_type = kPlaneTypeY; plane_type < kNumPlaneTypes;
+ ++plane_type) {
+ auto& quantizer_matrix = (*quantizer_matrix_ptr)[level][plane_type];
+ // Notes about how these matrices are populated:
+      // * For square transforms, we store only the lower left triangle (the
+      //   matrix is symmetric about the main diagonal), so when populating the
+      //   matrix we have to fill in the upper right triangle.
+      // * For rectangular transforms, the matrix for HxW is the transpose of
+      //   the matrix for WxH, so we populate with memcpy when w < h and by
+      //   transposing when w > h.
+      // * 16x16 is a special case: it is derived from the 32x32 matrix by
+      //   sampling its even rows and columns.
+ // * We use the "adjusted transform size" when using these matrices, so we
+ // won't have to populate them for transform sizes with one of the
+ // dimensions equal to 64.
+ for (int tx_size = 0; tx_size < kNumTransformSizes; ++tx_size) {
+ if (kTransformWidth[tx_size] == 64 || kTransformHeight[tx_size] == 64) {
+ continue;
+ }
+ const int size = kTransformWidth[tx_size] * kTransformHeight[tx_size];
+ if (!quantizer_matrix[tx_size].Resize(size)) {
+ return false;
+ }
+ }
+#define QUANTIZER_MEMCPY(W, H) \
+ memcpy(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##W##x##H[level][plane_type], (W) * (H))
+#define QUANTIZER_TRANSPOSE(W, H) \
+ Transpose(quantizer_matrix[kTransformSize##W##x##H].get(), \
+ kQuantizerMatrix##H##x##W[level][plane_type], H, W)
+#define QUANTIZER_FILL_UPPER_TRIANGLE(SIZE) \
+ FillUpperTriangle(quantizer_matrix[kTransformSize##SIZE##x##SIZE].get(), \
+ kQuantizerMatrix##SIZE##x##SIZE[level][plane_type], SIZE)
+ QUANTIZER_FILL_UPPER_TRIANGLE(4); // 4x4
+ QUANTIZER_MEMCPY(4, 8); // 4x8
+ QUANTIZER_MEMCPY(4, 16); // 4x16
+ QUANTIZER_TRANSPOSE(8, 4); // 8x4
+ QUANTIZER_FILL_UPPER_TRIANGLE(8); // 8x8
+ QUANTIZER_MEMCPY(8, 16); // 8x16
+ QUANTIZER_MEMCPY(8, 32); // 8x32
+ QUANTIZER_TRANSPOSE(16, 4); // 16x4
+ QUANTIZER_TRANSPOSE(16, 8); // 16x8
+ QUANTIZER_MEMCPY(16, 32); // 16x32
+ QUANTIZER_TRANSPOSE(32, 8); // 32x8
+ QUANTIZER_TRANSPOSE(32, 16); // 32x16
+ QUANTIZER_FILL_UPPER_TRIANGLE(32); // 32x32
+ // 16x16.
+ Array2DView<uint8_t> dst16x16(
+ 16, 16, quantizer_matrix[kTransformSize16x16].get());
+ Array2DView<const uint8_t> src32x32(
+ 32, 32, quantizer_matrix[kTransformSize32x32].get());
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ dst16x16[y][x] = src32x32[MultiplyBy2(y)][MultiplyBy2(x)];
+ }
+ }
+#undef QUANTIZER_FILL_UPPER_TRIANGLE
+#undef QUANTIZER_TRANSPOSE
+#undef QUANTIZER_MEMCPY
+ }
+ }
+ return true;
+}
+
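Once initialized, a weight is read by indexing level, plane type, and the (adjusted) transform size, then treating the buffer as a row-major height x width grid — the layout established by the memcpy/Transpose population above. A minimal sketch with a hypothetical helper name:

// Hypothetical helper, not part of the API: fetches the quantizer-matrix
// weight for coefficient (row, column). Assumes |tx_size| is already the
// adjusted transform size (no dimension equal to 64), since those sizes are
// never populated.
inline uint8_t GetQuantizerMatrixWeight(const QuantizerMatrix& qm, int level,
                                        int plane_type, TransformSize tx_size,
                                        int row, int column) {
  const uint8_t* const weights = qm[level][plane_type][tx_size].get();
  return weights[row * kTransformWidth[tx_size] + column];
}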
int GetQIndex(const Segmentation& segmentation, int index, int base_qindex) {
if (segmentation.FeatureActive(index, kSegmentFeatureQuantizer)) {
const int segment_qindex =
diff --git a/libgav1/src/quantizer.h b/libgav1/src/quantizer.h
index e555115..00c53ab 100644
--- a/libgav1/src/quantizer.h
+++ b/libgav1/src/quantizer.h
@@ -20,11 +20,17 @@
#include <cstdint>
#include "src/utils/constants.h"
+#include "src/utils/dynamic_buffer.h"
#include "src/utils/segmentation.h"
#include "src/utils/types.h"
namespace libgav1 {
+using QuantizerMatrix = std::array<
+ std::array<std::array<DynamicBuffer<uint8_t>, kNumTransformSizes>,
+ kNumPlaneTypes>,
+ kNumQuantizerLevelsForQuantizerMatrix>;
+
// Implements the dequantization functions of Section 7.12.2.
class Quantizer {
public:
@@ -48,6 +54,9 @@
const int16_t* ac_lookup_;
};
+// Initializes the quantizer matrices for all levels, plane types and
+// transform sizes.
+bool InitializeQuantizerMatrix(QuantizerMatrix* quantizer_matrix);
+
// Get the quantizer index for the |index|th segment.
//
// This function has two use cases. What should be passed as the |base_qindex|
diff --git a/libgav1/src/quantizer_tables.inc b/libgav1/src/quantizer_tables.inc
index b5a89a8..34342c4 100644
--- a/libgav1/src/quantizer_tables.inc
+++ b/libgav1/src/quantizer_tables.inc
@@ -15,6729 +15,3066 @@
// This file is just a convenience to separate out all the quantizer table
// definitions from the quantizer functions.
-// Quantizer matrix is used only when level < 15.
-constexpr int kNumQuantizerLevelsForQuantizerMatrix = 15;
-constexpr int kQuantizerMatrixSize = 3344;
-
-constexpr uint16_t kQuantizerMatrixOffset[kNumTransformSizes] = {
- 0, 1360, 2704, 1392, 16, 1424, 2832, 2768, 1552, 80,
- 1680, 1680, 3088, 2192, 336, 336, 2192, 336, 336};
-
-constexpr uint8_t kQuantizerMatrix
- [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
- [kQuantizerMatrixSize] = {
- // Quantizer level 0.
- {
- {// Luma
- // Size 4x4
- 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150,
- 200,
- // Size 8x8
- 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102,
- 38, 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121,
- 68, 63, 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159,
- 168, 95, 89, 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142,
- 168, 199, 220,
- // Size 16x16
- 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119,
- 31, 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112,
- 31, 32, 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107,
- 34, 33, 35, 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105,
- 36, 34, 36, 42, 48, 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105,
- 44, 41, 42, 47, 54, 63, 67, 75, 79, 90, 92, 95, 100, 102, 109, 112,
- 48, 44, 45, 51, 57, 67, 71, 80, 85, 96, 99, 107, 108, 111, 117,
- 120, 59, 54, 54, 58, 64, 75, 80, 92, 98, 110, 113, 115, 116, 122,
- 125, 130, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 127, 130,
- 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110, 118, 134, 137, 140,
- 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113, 121, 137, 140,
- 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107, 115, 127,
- 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100, 108,
- 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96,
- 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104,
- 100, 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210,
- 220, 119, 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179,
- 193, 210, 220, 231,
- // Size 32x32
- 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65,
- 71, 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123,
- 31, 32, 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62,
- 68, 76, 77, 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116,
- 31, 32, 32, 32, 32, 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59,
- 65, 72, 73, 75, 80, 83, 86, 90, 93, 97, 101, 104, 108, 112, 116,
- 31, 32, 32, 32, 33, 33, 34, 35, 35, 38, 41, 43, 45, 49, 54, 56, 59,
- 64, 72, 73, 74, 79, 82, 85, 88, 91, 94, 97, 101, 104, 107, 111, 31,
- 32, 32, 33, 33, 34, 35, 36, 36, 39, 42, 44, 45, 50, 54, 56, 59, 64,
- 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100, 104, 107, 111, 32, 32,
- 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49, 53, 55, 58, 63, 69,
- 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106, 109, 34, 34, 33,
- 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63, 68, 74, 75,
- 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34, 34, 35,
- 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77, 79,
- 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36,
- 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85,
- 88, 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40,
- 45, 47, 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92,
- 92, 95, 98, 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42,
- 47, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95,
- 97, 100, 101, 102, 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44,
- 49, 52, 55, 59, 65, 67, 69, 74, 78, 80, 82, 87, 93, 94, 95, 98,
- 100, 103, 102, 105, 108, 110, 111, 113, 117, 121, 48, 46, 44, 45,
- 45, 46, 51, 53, 57, 61, 67, 69, 71, 76, 80, 83, 85, 90, 96, 97, 99,
- 103, 107, 105, 108, 111, 111, 113, 117, 119, 120, 122, 54, 51, 49,
- 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82, 87, 89, 92, 97, 104,
- 105, 106, 111, 110, 111, 114, 113, 116, 120, 120, 121, 125, 130,
- 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92, 95, 98,
- 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125, 129,
- 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89,
- 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129,
- 131, 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82,
- 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130,
- 134, 133, 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73,
- 78, 84, 87, 90, 97, 103, 107, 111, 117, 125, 126, 128, 134, 132,
- 136, 133, 138, 137, 140, 143, 142, 145, 150, 80, 76, 72, 72, 71,
- 69, 74, 76, 79, 84, 90, 93, 96, 104, 110, 114, 118, 125, 134, 135,
- 137, 139, 140, 139, 143, 142, 144, 146, 146, 151, 152, 151, 81, 77,
- 73, 73, 72, 70, 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119,
- 126, 135, 137, 138, 144, 147, 146, 148, 149, 151, 150, 156, 155,
- 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
- 113, 117, 121, 128, 137, 138, 140, 147, 151, 156, 152, 157, 155,
- 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82, 85, 91,
- 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154, 158,
- 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81,
- 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151,
- 154, 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86,
- 85, 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
- 139, 146, 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183,
- 190, 97, 92, 90, 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114,
- 116, 125, 130, 133, 143, 148, 152, 163, 166, 168, 174, 176, 182,
- 187, 189, 188, 193, 191, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98,
- 101, 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171,
- 174, 176, 183, 184, 191, 195, 199, 197, 204, 104, 99, 97, 94, 93,
- 93, 90, 92, 96, 100, 102, 108, 111, 116, 122, 125, 134, 137, 144,
- 151, 155, 165, 169, 179, 182, 184, 191, 193, 200, 204, 210, 206,
- 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
- 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193,
- 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102,
- 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168,
- 173, 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108,
- 104, 104, 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131,
- 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204, 210, 212,
- 220, 222, 230, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110,
- 112, 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179,
- 183, 193, 197, 210, 214, 220, 222, 231, 232, 123, 116, 116, 111,
- 111, 109, 110, 107, 107, 114, 114, 121, 122, 130, 130, 140, 140,
- 150, 151, 163, 164, 176, 177, 190, 191, 204, 206, 222, 224, 230,
- 232, 242,
- // Size 4x8
- 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110,
- 65, 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112,
- 146, 190,
- // Size 8x4
- 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112,
- 75, 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152,
- 178, 190,
- // Size 8x16
- 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32,
- 34, 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38,
- 48, 60, 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56,
- 76, 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58,
- 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141, 135, 135,
- 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88, 106, 130, 148,
- 162, 159, 97, 86, 94, 107, 128, 157, 167, 171, 103, 93, 98, 114,
- 131, 150, 174, 186, 110, 100, 101, 117, 138, 161, 183, 193, 118,
- 107, 105, 118, 136, 157, 182, 203,
- // Size 16x8
- 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118,
- 32, 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107,
- 36, 34, 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105,
- 53, 49, 50, 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117,
- 118, 65, 59, 59, 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131,
- 138, 136, 87, 78, 77, 79, 84, 95, 102, 116, 124, 141, 144, 148,
- 157, 150, 161, 157, 93, 86, 82, 80, 86, 94, 105, 112, 122, 135,
- 149, 162, 167, 174, 183, 182, 99, 93, 89, 88, 90, 97, 105, 115,
- 124, 135, 146, 159, 171, 186, 193, 203,
- // Size 16x32
- 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102,
- 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31,
- 32, 33, 33, 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32,
- 33, 34, 35, 41, 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34,
- 35, 36, 42, 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
- 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42,
- 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92, 35, 34, 37, 41, 45, 50,
- 57, 61, 65, 76, 82, 83, 84, 84, 87, 90, 36, 34, 38, 43, 48, 54, 60,
- 64, 68, 78, 84, 87, 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69,
- 73, 84, 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75, 79,
- 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65, 73, 78, 82, 93,
- 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56, 67, 76, 80, 85, 96,
- 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60, 71, 82, 87, 92,
- 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63, 75, 87, 92,
- 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66, 77, 89,
- 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68, 79,
- 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73,
- 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70,
- 74, 79, 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81,
- 74, 71, 75, 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139,
- 142, 82, 75, 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149,
- 149, 146, 143, 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149,
- 153, 153, 152, 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130,
- 142, 148, 155, 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105,
- 118, 131, 137, 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94,
- 100, 107, 123, 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89,
- 87, 97, 100, 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103,
- 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186,
- 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177,
- 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138, 141,
- 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117,
- 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103,
- 105, 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122,
- 111, 111, 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203,
- 204, 217,
- // Size 32x16
- 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65,
- 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
- 31, 32, 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60,
- 65, 72, 74, 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32,
- 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63,
- 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34,
- 33, 34, 35, 37, 39, 41, 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74,
- 75, 76, 80, 81, 82, 85, 87, 90, 93, 97, 100, 103, 107, 36, 35, 34,
- 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80,
- 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 44, 42, 41,
- 41, 42, 42, 48, 50, 54, 58, 63, 65, 67, 71, 75, 77, 79, 84, 90, 91,
- 92, 97, 100, 100, 100, 100, 101, 104, 108, 112, 115, 119, 53, 51,
- 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97,
- 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118,
- 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87, 92,
- 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123,
- 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85,
- 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131,
- 136, 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84,
- 90, 93, 96, 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137,
- 140, 145, 144, 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79,
- 82, 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144,
- 149, 148, 153, 157, 152, 150, 155, 161, 159, 157, 156, 90, 85, 82,
- 81, 80, 78, 78, 83, 87, 89, 93, 100, 102, 107, 115, 118, 123, 132,
- 136, 140, 151, 153, 155, 160, 161, 164, 170, 168, 165, 167, 172,
- 178, 93, 88, 86, 84, 82, 82, 80, 84, 86, 91, 94, 98, 105, 107, 112,
- 119, 122, 130, 135, 140, 149, 153, 162, 165, 167, 173, 174, 177,
- 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83, 84, 89, 91, 95,
- 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152, 160, 167,
- 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89, 89, 88,
- 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
- 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97,
- 93, 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130,
- 131, 142, 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208,
- 217,
- // Size 4x16
- 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34,
- 54, 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60,
- 79, 118, 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142,
- 160, 88, 100, 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108,
- 115, 151, 197,
- // Size 16x4
- 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108,
- 44, 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108,
- 115, 79, 72, 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144,
- 141, 151, 96, 90, 86, 83, 89, 95, 102, 111, 123, 135, 149, 160,
- 173, 180, 188, 197,
- // Size 8x32
- 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31,
- 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34,
- 36, 50, 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42,
- 54, 63, 79, 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60,
- 68, 84, 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79,
- 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85,
- 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107, 58, 54, 63, 87,
- 98, 116, 112, 115, 61, 56, 66, 89, 101, 120, 119, 116, 65, 58, 68,
- 92, 105, 124, 122, 124, 71, 63, 73, 97, 111, 132, 130, 127, 79, 70,
- 79, 104, 118, 141, 135, 135, 81, 71, 80, 105, 119, 142, 140, 139,
- 82, 72, 81, 106, 121, 144, 149, 146, 88, 77, 85, 108, 126, 149,
- 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94, 83, 91, 105,
- 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171, 100, 89,
- 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174, 186,
- 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138,
- 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107,
- 105, 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179,
- 204,
- // Size 32x8
- 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65,
- 71, 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122,
- 32, 32, 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58,
- 63, 70, 71, 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36,
- 35, 34, 35, 36, 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73,
- 79, 80, 81, 85, 88, 91, 94, 97, 98, 100, 101, 103, 105, 107, 53,
- 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82, 87, 89, 92, 97,
- 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117, 117, 118,
- 119, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98,
- 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136,
- 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98,
- 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157,
- 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80, 84,
- 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153,
- 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90,
- 89, 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135,
- 139, 146, 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204},
- {// Chroma
- // Size 4x4
- 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
- // Size 8x8
- 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47,
- 46, 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52,
- 61, 72, 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67,
- 75, 86, 95, 104, 107, 71, 67, 68, 75, 84, 95, 107, 113,
- // Size 16x16
- 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30,
- 32, 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35,
- 39, 45, 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45,
- 48, 50, 49, 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50,
- 53, 53, 54, 55, 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53,
- 58, 60, 62, 63, 67, 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60,
- 61, 65, 67, 71, 71, 74, 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65,
- 71, 73, 78, 79, 78, 77, 78, 78, 78, 57, 52, 51, 53, 56, 63, 67, 73,
- 76, 82, 83, 84, 84, 84, 82, 83, 63, 57, 56, 57, 60, 67, 71, 78, 82,
- 89, 90, 90, 89, 88, 87, 88, 65, 58, 57, 58, 61, 68, 71, 79, 83, 90,
- 91, 94, 93, 93, 92, 93, 68, 62, 60, 59, 64, 67, 74, 78, 84, 90, 94,
- 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69, 73, 77, 84, 89, 93, 99,
- 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73, 78, 84, 88, 93, 98,
- 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78, 82, 87, 92, 98,
- 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83, 88, 93, 98,
- 104, 109, 112, 116,
- // Size 32x32
- 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57,
- 60, 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31,
- 31, 33, 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60,
- 61, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33,
- 35, 40, 42, 44, 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58,
- 61, 62, 63, 64, 66, 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41,
- 43, 45, 47, 46, 45, 46, 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61,
- 62, 63, 64, 65, 66, 67, 68, 69, 70, 33, 34, 35, 37, 39, 43, 45, 46,
- 47, 46, 45, 46, 46, 47, 49, 50, 51, 53, 56, 57, 57, 59, 60, 61, 62,
- 63, 64, 65, 66, 68, 69, 70, 36, 38, 40, 41, 43, 47, 47, 47, 48, 46,
- 45, 46, 46, 47, 48, 49, 50, 52, 54, 55, 55, 57, 58, 59, 61, 62, 64,
- 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47, 48, 49, 50, 49, 49, 49,
- 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59, 60, 61, 61, 63, 64,
- 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51, 51, 51, 52, 52,
- 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63, 63, 64, 65,
- 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54, 55, 56,
- 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66, 48,
- 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61,
- 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45,
- 45, 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67,
- 68, 69, 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46,
- 46, 49, 51, 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70,
- 71, 71, 70, 70, 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50,
- 52, 54, 56, 60, 60, 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72,
- 73, 74, 73, 73, 74, 74, 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54,
- 57, 61, 62, 63, 66, 68, 69, 70, 72, 75, 75, 75, 77, 75, 75, 76, 75,
- 75, 76, 75, 75, 76, 77, 54, 52, 49, 49, 49, 48, 52, 53, 55, 58, 62,
- 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 77, 78, 78, 77,
- 78, 79, 78, 78, 56, 53, 51, 50, 50, 49, 52, 54, 56, 59, 63, 64, 66,
- 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80, 81, 81, 79, 81, 80, 79,
- 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73,
- 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82, 82, 84, 83, 82,
- 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75, 77, 79,
- 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86, 63, 60,
- 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85, 89,
- 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57,
- 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91,
- 92, 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55,
- 58, 60, 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94,
- 96, 93, 94, 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61,
- 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98,
- 95, 97, 95, 96, 95, 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66,
- 67, 71, 74, 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 98, 99, 100,
- 98, 99, 98, 98, 98, 97, 69, 65, 63, 62, 61, 59, 59, 62, 64, 65, 68,
- 71, 72, 75, 79, 80, 83, 87, 89, 92, 96, 97, 98, 100, 100, 101, 102,
- 101, 101, 101, 100, 102, 70, 66, 64, 63, 62, 61, 60, 63, 64, 66,
- 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98, 99, 100, 102, 102,
- 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64,
- 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
- 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61, 63,
- 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102,
- 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63,
- 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99,
- 101, 104, 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66,
- 66, 64, 63, 66, 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96,
- 98, 101, 104, 106, 108, 108, 111, 111, 112, 113, 75, 71, 71, 68,
- 68, 67, 66, 64, 66, 68, 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93,
- 95, 98, 101, 103, 107, 108, 110, 111, 113, 113, 115, 76, 72, 72,
- 69, 69, 68, 67, 65, 66, 69, 70, 72, 74, 76, 78, 81, 83, 85, 88, 90,
- 93, 95, 98, 100, 104, 105, 109, 111, 112, 113, 116, 115, 78, 74,
- 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87,
- 92, 92, 96, 97, 102, 102, 107, 107, 112, 113, 115, 115, 118,
- // Size 4x8
- 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54,
- 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105,
- // Size 8x4
- 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60,
- 54, 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105,
- // Size 8x16
- 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33,
- 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48,
- 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54,
- 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70,
- 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83,
- 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96,
- 99, 98, 72, 64, 66, 75, 83, 92, 101, 104, 74, 67, 66, 74, 84, 94,
- 103, 106, 76, 69, 67, 73, 82, 91, 101, 109,
- // Size 16x8
- 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37,
- 40, 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46,
- 47, 50, 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47,
- 50, 54, 61, 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53,
- 57, 64, 67, 73, 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62,
- 69, 73, 80, 84, 92, 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66,
- 72, 76, 80, 87, 93, 98, 99, 101, 103, 101, 71, 66, 63, 62, 62, 66,
- 70, 75, 79, 84, 89, 94, 98, 104, 106, 109,
- // Size 16x32
- 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31,
- 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32,
- 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41,
- 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45,
- 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47,
- 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49,
- 50, 52, 53, 57, 60, 58, 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52,
- 54, 55, 59, 61, 61, 61, 60, 61, 61, 49, 46, 48, 50, 53, 53, 54, 55,
- 57, 60, 62, 63, 62, 63, 62, 62, 48, 46, 47, 50, 53, 56, 57, 59, 60,
- 64, 66, 65, 65, 64, 64, 65, 49, 45, 46, 49, 53, 58, 61, 62, 64, 67,
- 69, 67, 66, 66, 66, 65, 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71,
- 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72,
- 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73,
- 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74,
- 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76,
- 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
- 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81, 63,
- 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58,
- 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56,
- 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60,
- 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64,
- 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70,
- 72, 79, 85, 88, 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73,
- 81, 83, 89, 96, 97, 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74,
- 79, 82, 90, 93, 98, 102, 102, 102, 101, 72, 65, 64, 62, 66, 68, 75,
- 78, 83, 89, 92, 100, 101, 103, 104, 102, 73, 66, 65, 63, 66, 69,
- 75, 76, 84, 87, 93, 98, 102, 105, 106, 107, 74, 67, 67, 64, 66, 70,
- 74, 77, 84, 86, 94, 96, 103, 105, 106, 107, 75, 68, 68, 65, 66, 71,
- 74, 78, 83, 87, 93, 96, 103, 105, 109, 109, 76, 69, 69, 66, 67, 72,
- 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77, 70, 70, 67, 67, 73,
- 73, 81, 81, 90, 90, 99, 99, 108, 108, 113,
- // Size 32x16
- 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57,
- 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31,
- 32, 34, 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57,
- 58, 59, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41,
- 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56,
- 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47,
- 48, 49, 50, 50, 49, 49, 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60,
- 60, 60, 61, 62, 63, 64, 65, 66, 67, 48, 47, 46, 46, 47, 47, 50, 51,
- 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66,
- 67, 66, 66, 66, 66, 67, 67, 49, 47, 45, 45, 46, 45, 49, 51, 53, 56,
- 58, 59, 59, 61, 62, 63, 64, 65, 67, 68, 68, 69, 71, 70, 69, 68, 68,
- 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62,
- 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74,
- 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59, 62, 64, 65, 68,
- 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76, 77, 78, 80,
- 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75,
- 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 63,
- 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85,
- 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60,
- 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92,
- 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60,
- 58, 58, 61, 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96,
- 96, 98, 97, 98, 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60,
- 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98,
- 99, 99, 102, 101, 102, 103, 103, 101, 99, 69, 66, 65, 63, 62, 61,
- 60, 60, 63, 64, 66, 68, 70, 73, 74, 78, 80, 82, 85, 87, 91, 92, 96,
- 98, 101, 102, 103, 105, 105, 105, 107, 108, 71, 67, 66, 64, 63, 62,
- 62, 61, 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94,
- 97, 98, 102, 104, 106, 106, 109, 109, 108, 72, 68, 68, 65, 65, 63,
- 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77, 81, 81, 86, 87, 91, 91,
- 96, 97, 101, 102, 107, 107, 109, 110, 113,
- // Size 4x16
- 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46,
- 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64,
- 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89,
- 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107,
- // Size 16x4
- 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49,
- 45, 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57,
- 56, 57, 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62,
- 60, 63, 66, 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
- // Size 8x32
- 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30,
- 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43,
- 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50,
- 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54,
- 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64,
- 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73,
- 72, 70, 52, 47, 54, 66, 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76,
- 75, 55, 49, 56, 69, 75, 82, 79, 76, 57, 50, 56, 70, 76, 84, 80, 79,
- 60, 52, 58, 72, 79, 88, 84, 81, 63, 55, 60, 75, 82, 92, 87, 84, 64,
- 55, 61, 75, 82, 92, 89, 86, 64, 56, 61, 75, 83, 93, 93, 89, 67, 58,
- 63, 76, 85, 95, 94, 91, 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65,
- 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74,
- 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75,
- 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74,
- 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73,
- 81, 90, 99, 108,
- // Size 32x8
- 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57,
- 60, 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38,
- 40, 41, 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55,
- 55, 56, 58, 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46,
- 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61,
- 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47,
- 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74,
- 72, 73, 74, 75, 75, 74, 74, 73, 73, 57, 54, 52, 51, 51, 50, 53, 55,
- 57, 60, 64, 65, 67, 71, 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83,
- 82, 83, 84, 84, 83, 82, 81, 66, 63, 60, 59, 59, 57, 60, 61, 62, 66,
- 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95, 94, 95, 96, 93, 92,
- 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68,
- 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102,
- 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67,
- 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106,
- 106, 109, 109, 108},
- },
- // Quantizer level 1.
- {
- {// Luma
- // Size 4x4
- 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140,
- 184,
- // Size 8x8
- 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96,
- 37, 39, 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113,
- 62, 58, 71, 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148,
- 155, 90, 84, 93, 106, 129, 148, 169, 183, 102, 96, 100, 113, 132,
- 155, 183, 201,
- // Size 16x16
- 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111,
- 31, 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105,
- 31, 32, 33, 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100,
- 32, 33, 34, 36, 40, 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36,
- 34, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37,
- 39, 42, 50, 54, 60, 65, 70, 78, 84, 89, 95, 96, 102, 105, 47, 44,
- 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 102, 104, 109, 112, 54,
- 50, 50, 51, 60, 65, 75, 82, 89, 97, 104, 109, 110, 114, 117, 121,
- 61, 56, 56, 57, 65, 70, 81, 89, 97, 106, 113, 119, 122, 126, 125,
- 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117, 125, 131, 134, 134,
- 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125, 134, 140, 142,
- 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119, 131, 140,
- 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110, 122,
- 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104,
- 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95,
- 102, 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105,
- 100, 98, 98, 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210,
- // Size 32x32
- 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61,
- 65, 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114,
- 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58,
- 62, 68, 72, 76, 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31,
- 32, 32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60,
- 65, 70, 73, 76, 79, 82, 85, 88, 91, 95, 98, 101, 105, 109, 31, 32,
- 32, 32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 56, 59, 65,
- 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 31, 32, 32,
- 32, 33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 56, 59, 64, 68,
- 71, 74, 77, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 32, 32, 33,
- 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 55, 58, 63, 66, 69,
- 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32, 33, 33, 33, 34,
- 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65, 68, 71, 73,
- 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35, 37,
- 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79,
- 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42,
- 48, 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86,
- 88, 90, 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50,
- 51, 52, 56, 58, 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93,
- 94, 95, 96, 100, 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52,
- 54, 58, 60, 62, 65, 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96,
- 99, 102, 104, 105, 106, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58,
- 63, 66, 68, 71, 75, 77, 79, 84, 88, 90, 92, 95, 97, 97, 99, 102,
- 103, 103, 106, 109, 113, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58,
- 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 100, 102, 105,
- 104, 106, 109, 111, 112, 113, 49, 47, 46, 45, 46, 46, 48, 51, 57,
- 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98, 100, 103, 105, 107,
- 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49, 50, 49, 51, 54,
- 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104, 106, 109,
- 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54, 54,
- 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110,
- 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56,
- 56, 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106,
- 110, 113, 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130,
- 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98,
- 101, 105, 111, 115, 118, 121, 124, 128, 125, 129, 128, 131, 133,
- 132, 135, 139, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78, 84, 89,
- 92, 97, 103, 106, 111, 117, 122, 125, 128, 131, 131, 134, 132, 134,
- 136, 136, 140, 141, 140, 76, 72, 70, 69, 68, 66, 68, 71, 76, 79,
- 81, 88, 92, 95, 101, 107, 110, 115, 122, 127, 130, 133, 136, 136,
- 138, 139, 141, 140, 145, 143, 146, 151, 80, 76, 73, 72, 71, 69, 71,
- 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125, 130, 134, 137,
- 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78, 76, 75,
- 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128,
- 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162,
- 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116,
- 119, 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160,
- 162, 165, 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100,
- 105, 112, 114, 120, 128, 131, 136, 146, 147, 150, 155, 156, 161,
- 166, 165, 167, 169, 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86,
- 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152, 154,
- 156, 162, 163, 168, 173, 174, 174, 178, 176, 95, 90, 88, 86, 85,
- 84, 82, 86, 88, 93, 95, 99, 105, 106, 113, 118, 121, 129, 132, 139,
- 146, 148, 159, 161, 163, 169, 170, 176, 180, 183, 181, 187, 98, 93,
- 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104, 109, 114, 117, 126,
- 128, 134, 141, 145, 154, 157, 166, 168, 170, 176, 178, 184, 188,
- 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106,
- 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176,
- 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90, 95,
- 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156,
- 160, 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101,
- 98, 97, 96, 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130,
- 132, 140, 143, 150, 155, 162, 169, 174, 183, 188, 192, 194, 201,
- 202, 210, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105, 109,
- 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
- 181, 193, 196, 201, 202, 210, 211, 114, 109, 109, 104, 104, 102,
- 102, 99, 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140,
- 151, 151, 162, 162, 175, 176, 187, 188, 203, 204, 210, 211, 219,
- // Size 4x8
- 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105,
- 60, 79, 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136,
- 177,
- // Size 8x4
- 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105,
- 69, 64, 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167,
- 177,
- // Size 8x16
- 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32,
- 34, 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37,
- 48, 56, 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56,
- 69, 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56,
- 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79,
- 70, 79, 95, 118, 133, 142, 138, 86, 76, 84, 100, 124, 140, 153,
- 150, 92, 82, 89, 101, 121, 148, 157, 161, 98, 88, 93, 108, 124,
- 141, 163, 174, 104, 94, 95, 110, 129, 151, 171, 181, 110, 100, 98,
- 111, 127, 147, 169, 188,
- // Size 16x8
- 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110,
- 32, 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100,
- 36, 35, 36, 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47,
- 44, 45, 47, 56, 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111,
- 65, 60, 59, 60, 68, 73, 84, 92, 100, 111, 118, 124, 121, 124, 129,
- 127, 79, 72, 71, 71, 78, 84, 95, 103, 113, 125, 133, 140, 148, 141,
- 151, 147, 90, 84, 80, 78, 83, 91, 101, 108, 116, 129, 142, 153,
- 157, 163, 171, 169, 96, 90, 87, 85, 87, 94, 101, 110, 118, 129,
- 138, 150, 161, 174, 181, 188,
- // Size 16x32
- 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31,
- 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32,
- 32, 33, 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33,
- 33, 35, 41, 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34,
- 36, 42, 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38,
- 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44,
- 47, 51, 60, 66, 71, 76, 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50,
- 54, 63, 69, 73, 80, 82, 81, 84, 86, 36, 34, 37, 40, 48, 54, 56, 60,
- 68, 74, 78, 84, 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71,
- 77, 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65, 73, 79,
- 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63, 66, 71, 79, 85, 90,
- 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66, 69, 75, 84, 90, 95, 99,
- 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71, 77, 86, 93, 97, 103,
- 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103,
- 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105,
- 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100,
- 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
- 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89,
- 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88,
- 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71,
- 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136, 82, 75,
- 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144, 144, 145,
- 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147, 153, 151,
- 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130, 145, 152, 156,
- 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116, 121, 132, 148,
- 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95, 105, 114, 120,
- 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85, 93, 95, 108,
- 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91, 88, 94,
- 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95, 94,
- 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107,
- 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188,
- 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169,
- 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145,
- 145, 166, 166, 189, 190, 201,
- // Size 32x16
- 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61,
- 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
- 31, 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56,
- 60, 65, 69, 72, 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32,
- 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59,
- 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 32, 33,
- 33, 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 57, 60, 65,
- 69, 71, 74, 77, 78, 80, 83, 85, 88, 91, 94, 97, 100, 36, 35, 35,
- 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76,
- 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 44, 42, 41, 41,
- 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77, 79, 84, 88, 90,
- 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45, 44, 44, 45,
- 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97,
- 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49,
- 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101,
- 104, 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62,
- 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105,
- 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128, 127,
- 127, 73, 69, 67, 66, 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99,
- 105, 107, 112, 119, 123, 127, 130, 133, 130, 132, 136, 136, 133,
- 132, 136, 141, 145, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90,
- 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143,
- 141, 146, 151, 149, 147, 145, 87, 83, 80, 79, 78, 76, 76, 80, 84,
- 86, 90, 96, 99, 103, 111, 114, 118, 126, 130, 134, 143, 146, 147,
- 152, 151, 155, 160, 158, 154, 156, 161, 166, 90, 86, 84, 82, 80,
- 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116, 124, 129, 134,
- 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169, 166, 93, 88,
- 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113, 117,
- 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177,
- 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102,
- 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174,
- 179, 181, 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93,
- 93, 99, 99, 106, 107, 115, 116, 124, 125, 135, 136, 145, 146, 158,
- 159, 170, 171, 185, 186, 192, 193, 201,
- // Size 4x16
- 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34,
- 54, 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56,
- 77, 107, 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151,
- 84, 95, 132, 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108,
- 141, 183,
- // Size 16x4
- 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101,
- 44, 41, 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108,
- 73, 67, 65, 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132,
- 141, 93, 87, 83, 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169,
- 175, 183,
- // Size 8x32
- 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31,
- 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34,
- 36, 45, 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40,
- 47, 60, 71, 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56,
- 68, 78, 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73,
- 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95,
- 101, 101, 49, 47, 57, 71, 86, 97, 103, 102, 53, 50, 60, 75, 92,
- 103, 108, 110, 58, 54, 63, 79, 98, 110, 114, 111, 61, 56, 65, 81,
- 100, 113, 116, 118, 65, 59, 68, 84, 105, 118, 124, 121, 71, 64, 73,
- 89, 111, 125, 129, 129, 76, 68, 76, 92, 115, 130, 134, 132, 79, 70,
- 79, 95, 118, 133, 142, 138, 82, 73, 81, 97, 121, 136, 145, 144, 86,
- 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99, 124, 145, 156,
- 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92, 105, 120,
- 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91, 94,
- 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107,
- 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169,
- 188, 114, 104, 100, 111, 127, 145, 166, 190,
- // Size 32x8
- 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61,
- 65, 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114,
- 32, 32, 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56,
- 59, 64, 68, 70, 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36,
- 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68,
- 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98, 100, 47, 45,
- 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89,
- 92, 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 65,
- 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100,
- 105, 111, 115, 118, 121, 124, 124, 121, 120, 124, 128, 129, 128,
- 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97,
- 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141,
- 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88,
- 91, 94, 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156,
- 157, 163, 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85,
- 84, 87, 90, 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138,
- 144, 150, 156, 161, 171, 174, 179, 181, 188, 188, 190},
- {// Chroma
- // Size 4x4
- 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
- // Size 8x8
- 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47,
- 47, 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50,
- 58, 67, 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65,
- 73, 84, 92, 101, 103, 69, 65, 66, 73, 82, 92, 103, 109,
- // Size 16x16
- 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30,
- 31, 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35,
- 39, 44, 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44,
- 47, 49, 48, 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49,
- 53, 53, 54, 54, 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53,
- 54, 56, 57, 59, 61, 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56,
- 61, 63, 65, 68, 70, 72, 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63,
- 66, 69, 72, 75, 76, 75, 76, 76, 76, 55, 51, 50, 50, 56, 59, 65, 69,
- 73, 77, 79, 81, 81, 81, 80, 80, 60, 55, 53, 53, 58, 61, 68, 72, 77,
- 82, 85, 87, 87, 85, 84, 85, 63, 58, 56, 55, 60, 63, 70, 75, 79, 85,
- 89, 91, 91, 90, 89, 90, 66, 60, 58, 58, 62, 65, 72, 76, 81, 87, 91,
- 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67, 71, 75, 81, 87, 91, 96,
- 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71, 76, 81, 85, 90, 95,
- 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76, 80, 84, 89, 95,
- 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80, 85, 90, 95,
- 100, 105, 108, 111,
- // Size 32x32
- 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55,
- 57, 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31,
- 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57,
- 59, 60, 61, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32,
- 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58,
- 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40,
- 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59,
- 61, 62, 63, 63, 64, 65, 66, 67, 68, 33, 34, 35, 36, 39, 43, 44, 45,
- 47, 46, 46, 45, 46, 47, 47, 49, 50, 51, 53, 55, 56, 57, 58, 59, 60,
- 61, 62, 63, 65, 66, 67, 68, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47,
- 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 58, 59, 61, 62,
- 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47,
- 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58, 58, 59, 60, 61, 62,
- 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50,
- 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61, 62, 63, 63,
- 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55,
- 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64, 49,
- 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58,
- 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46,
- 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63,
- 63, 64, 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45,
- 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68,
- 69, 69, 68, 68, 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47,
- 50, 54, 55, 56, 59, 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71,
- 71, 72, 71, 71, 72, 72, 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54,
- 55, 56, 60, 61, 62, 64, 66, 66, 67, 69, 70, 71, 72, 73, 73, 74, 73,
- 73, 74, 73, 73, 74, 75, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57,
- 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 77, 75, 76, 76, 75,
- 76, 77, 76, 75, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64,
- 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78, 79, 78, 77, 78, 78, 77,
- 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66, 69,
- 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80, 80, 81, 80, 79,
- 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67, 70, 73, 74,
- 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83, 60, 57,
- 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79, 82,
- 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56,
- 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87,
- 88, 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54,
- 55, 57, 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91,
- 93, 91, 91, 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58,
- 61, 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95,
- 92, 94, 92, 93, 92, 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64,
- 65, 69, 72, 73, 76, 80, 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95,
- 95, 95, 95, 95, 93, 67, 64, 62, 61, 59, 58, 58, 60, 63, 64, 66, 69,
- 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97, 97, 98, 99, 97, 97,
- 97, 96, 98, 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74,
- 75, 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 98, 100, 100, 100, 99,
- 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62, 65, 66, 68, 72, 73,
- 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101, 100, 102, 102,
- 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65, 66, 69, 71,
- 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103, 102,
- 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
- 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105,
- 104, 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69,
- 72, 73, 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104,
- 107, 106, 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69,
- 72, 73, 77, 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106,
- 106, 109, 108, 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70,
- 72, 74, 76, 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106,
- 108, 108, 111, 110, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71,
- 71, 75, 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108,
- 108, 110, 110, 113,
- // Size 4x8
- 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52,
- 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102,
- // Size 8x4
- 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57,
- 52, 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102,
- // Size 8x16
- 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33,
- 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48,
- 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54,
- 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65,
- 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82,
- 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94,
- 97, 96, 70, 62, 65, 73, 81, 89, 98, 101, 72, 65, 65, 72, 82, 92,
- 100, 103, 74, 67, 65, 71, 79, 89, 98, 105,
- // Size 16x8
- 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35,
- 38, 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46,
- 47, 48, 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46,
- 47, 54, 56, 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51,
- 57, 60, 66, 71, 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60,
- 64, 70, 75, 79, 85, 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65,
- 71, 75, 79, 85, 91, 95, 97, 98, 100, 98, 70, 65, 63, 61, 61, 65,
- 69, 74, 78, 82, 87, 91, 96, 101, 103, 105,
- // Size 16x32
- 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31,
- 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32,
- 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38,
- 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44,
- 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47,
- 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47,
- 47, 48, 51, 54, 55, 57, 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50,
- 50, 53, 56, 57, 60, 60, 59, 60, 60, 49, 46, 48, 49, 53, 53, 54, 54,
- 57, 59, 60, 63, 61, 62, 61, 61, 48, 46, 47, 48, 53, 55, 55, 56, 58,
- 61, 62, 64, 64, 63, 63, 64, 48, 46, 46, 48, 53, 56, 56, 57, 60, 62,
- 64, 66, 65, 65, 65, 64, 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67,
- 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71,
- 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72,
- 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73,
- 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74,
- 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
- 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79, 60,
- 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56,
- 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55,
- 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57,
- 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62,
- 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68,
- 71, 78, 83, 86, 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71,
- 79, 81, 86, 94, 95, 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77,
- 80, 88, 91, 96, 99, 99, 100, 98, 70, 64, 62, 60, 65, 66, 73, 76,
- 81, 87, 89, 97, 98, 100, 101, 99, 71, 65, 64, 61, 65, 67, 73, 74,
- 82, 85, 90, 95, 99, 102, 103, 104, 72, 65, 65, 62, 65, 68, 72, 75,
- 82, 83, 92, 93, 100, 102, 103, 104, 73, 66, 66, 63, 65, 69, 72, 76,
- 81, 85, 90, 93, 100, 102, 105, 106, 74, 67, 67, 64, 65, 70, 71, 77,
- 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68, 65, 65, 71, 71, 78,
- 78, 87, 87, 96, 96, 105, 105, 109,
- // Size 32x16
- 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55,
- 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31,
- 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54,
- 56, 57, 59, 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38,
- 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55,
- 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47,
- 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58,
- 58, 59, 60, 60, 61, 62, 63, 64, 65, 48, 47, 46, 46, 47, 47, 48, 50,
- 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64,
- 65, 65, 65, 65, 65, 65, 65, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55,
- 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 68, 67, 66, 66,
- 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
- 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72,
- 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
- 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74, 75, 76, 77,
- 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73,
- 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 61,
- 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79,
- 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58,
- 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87,
- 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59,
- 57, 57, 60, 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93,
- 94, 96, 95, 96, 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58,
- 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96,
- 97, 99, 98, 99, 100, 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59,
- 62, 63, 65, 67, 68, 72, 73, 76, 78, 81, 84, 85, 89, 90, 93, 96, 98,
- 99, 100, 102, 102, 102, 103, 105, 70, 66, 65, 63, 63, 62, 61, 60,
- 61, 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96,
- 100, 101, 103, 103, 105, 105, 105, 71, 67, 67, 64, 64, 62, 62, 60,
- 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80, 84, 84, 89, 89, 94, 94,
- 98, 99, 104, 104, 106, 106, 109,
- // Size 4x16
- 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46,
- 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63,
- 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86,
- 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103,
- // Size 16x4
- 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49,
- 45, 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55,
- 54, 54, 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61,
- 59, 62, 65, 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
- // Size 8x32
- 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30,
- 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41,
- 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48,
- 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54,
- 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60,
- 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70,
- 71, 69, 51, 47, 54, 61, 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75,
- 74, 54, 49, 55, 65, 73, 78, 78, 74, 55, 49, 56, 65, 74, 79, 79, 78,
- 57, 50, 56, 66, 76, 82, 83, 79, 60, 53, 58, 68, 79, 85, 85, 82, 62,
- 54, 60, 69, 81, 87, 87, 84, 63, 55, 60, 70, 82, 89, 91, 87, 64, 56,
- 61, 71, 83, 90, 92, 89, 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63,
- 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72,
- 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73,
- 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72,
- 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71,
- 78, 87, 96, 105,
- // Size 32x8
- 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55,
- 57, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37,
- 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53,
- 54, 55, 56, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46,
- 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60,
- 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46,
- 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72,
- 71, 71, 72, 73, 73, 72, 72, 71, 71, 57, 54, 52, 52, 51, 50, 51, 53,
- 57, 58, 60, 64, 66, 68, 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81,
- 80, 81, 82, 82, 81, 79, 78, 63, 60, 58, 57, 56, 54, 55, 57, 60, 62,
- 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 94, 91, 89,
- 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67,
- 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99,
- 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66,
- 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103,
- 103, 105, 105, 105},
- },
- // Quantizer level 2.
- {
- {// Luma
- // Size 4x4
- 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
- // Size 8x8
- 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35,
- 37, 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57,
- 54, 64, 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144,
- 85, 79, 87, 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144,
- 168, 184,
- // Size 16x16
- 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104,
- 31, 32, 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31,
- 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32,
- 34, 35, 37, 40, 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36,
- 37, 42, 47, 50, 53, 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40,
- 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50,
- 58, 63, 68, 74, 79, 84, 91, 96, 98, 102, 104, 49, 46, 46, 46, 53,
- 62, 68, 73, 81, 87, 92, 99, 103, 107, 109, 112, 58, 54, 53, 52, 59,
- 68, 74, 81, 90, 97, 102, 110, 114, 118, 117, 121, 65, 60, 59, 58,
- 65, 73, 79, 87, 97, 105, 111, 120, 125, 125, 126, 130, 71, 66, 64,
- 63, 70, 78, 84, 92, 102, 111, 117, 127, 133, 134, 136, 141, 81, 75,
- 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143, 145, 148, 152, 87,
- 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150, 156, 160,
- 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156, 163,
- 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148,
- 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130,
- 141, 152, 163, 177, 184, 191,
- // Size 32x32
- 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58,
- 59, 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31,
- 32, 32, 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57,
- 62, 66, 68, 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60,
- 64, 66, 73, 75, 76, 81, 83, 86, 89, 92, 95, 98, 101, 31, 32, 32,
- 32, 32, 32, 32, 33, 34, 34, 37, 38, 41, 44, 45, 49, 53, 54, 59, 63,
- 65, 72, 74, 75, 79, 81, 84, 86, 89, 91, 94, 97, 31, 32, 32, 32, 33,
- 33, 34, 35, 36, 36, 39, 40, 42, 45, 46, 50, 53, 54, 59, 63, 64, 71,
- 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 32, 33, 34, 34,
- 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58, 62, 63, 70, 72, 73,
- 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34, 34, 35, 37, 37,
- 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71, 72, 75, 78,
- 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41, 42, 45,
- 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82, 83,
- 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50,
- 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90,
- 92, 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57,
- 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99,
- 39, 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68,
- 69, 73, 76, 78, 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39,
- 39, 38, 40, 40, 41, 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75,
- 78, 80, 86, 87, 88, 91, 93, 96, 97, 97, 99, 102, 105, 44, 42, 41,
- 41, 42, 42, 42, 47, 50, 54, 58, 59, 63, 66, 68, 71, 74, 75, 79, 83,
- 84, 90, 91, 92, 96, 98, 98, 99, 102, 104, 104, 105, 48, 46, 45, 44,
- 45, 45, 45, 50, 53, 56, 61, 62, 66, 70, 71, 76, 79, 80, 85, 88, 90,
- 96, 97, 98, 101, 100, 102, 105, 105, 105, 109, 112, 49, 47, 46, 45,
- 46, 46, 46, 51, 53, 57, 62, 63, 68, 71, 73, 77, 81, 82, 87, 90, 92,
- 98, 99, 100, 103, 106, 107, 106, 109, 112, 112, 112, 54, 51, 50,
- 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 86, 87, 92, 96,
- 97, 104, 105, 106, 110, 110, 109, 113, 114, 113, 116, 120, 58, 55,
- 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97,
- 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121, 121, 120,
- 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82, 87, 91,
- 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122, 125,
- 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87,
- 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126,
- 130, 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83,
- 88, 90, 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131,
- 130, 134, 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73,
- 78, 80, 84, 90, 92, 97, 102, 104, 111, 115, 117, 125, 127, 128,
- 133, 136, 134, 139, 136, 139, 141, 140, 80, 76, 73, 72, 71, 70, 69,
- 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125, 134,
- 136, 137, 142, 138, 143, 140, 144, 144, 144, 149, 81, 77, 75, 74,
- 73, 72, 71, 75, 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 120,
- 125, 127, 136, 137, 139, 143, 148, 145, 148, 148, 150, 152, 149,
- 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106,
- 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153, 153, 154,
- 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91, 96,
- 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151,
- 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83,
- 87, 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148,
- 149, 151, 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83,
- 82, 80, 82, 85, 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131,
- 134, 143, 145, 153, 156, 157, 163, 164, 169, 172, 177, 172, 95, 90,
- 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105, 106, 113, 116,
- 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169, 170, 176,
- 179, 185, 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
- 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169,
- 170, 176, 177, 184, 186, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93,
- 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155,
- 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94, 94, 92,
- 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135,
- 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107,
- 101, 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120,
- 120, 129, 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186,
- 191, 192, 199,
- // Size 4x8
- 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101,
- 54, 69, 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127,
- 165,
- // Size 8x4
- 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62,
- 58, 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165,
- // Size 8x16
- 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32,
- 33, 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36,
- 44, 50, 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53,
- 63, 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63,
- 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64,
- 73, 84, 102, 125, 135, 131, 81, 72, 80, 91, 110, 135, 145, 141, 87,
- 77, 85, 96, 114, 140, 148, 151, 92, 83, 88, 102, 117, 133, 153,
- 163, 98, 88, 89, 103, 121, 141, 160, 169, 103, 94, 92, 103, 119,
- 137, 158, 175,
- // Size 16x8
- 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103,
- 32, 32, 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36,
- 35, 36, 38, 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41,
- 42, 42, 50, 58, 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54,
- 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114, 117, 121, 119, 79,
- 73, 71, 69, 75, 84, 90, 97, 108, 118, 125, 135, 140, 133, 141, 137,
- 88, 81, 78, 76, 81, 88, 97, 104, 111, 123, 135, 145, 148, 153, 160,
- 158, 93, 88, 84, 82, 84, 90, 97, 105, 113, 122, 131, 141, 151, 163,
- 169, 175,
- // Size 16x32
- 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31,
- 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32,
- 32, 32, 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32,
- 33, 34, 37, 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34,
- 36, 39, 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37,
- 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40,
- 42, 49, 52, 58, 69, 70, 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48,
- 54, 57, 63, 73, 75, 79, 79, 81, 83, 34, 34, 36, 37, 44, 47, 50, 56,
- 59, 65, 75, 77, 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63,
- 68, 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65, 68, 73,
- 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55, 59, 67, 70, 75, 85,
- 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58, 63, 71, 74, 79, 90, 91,
- 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99,
- 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104,
- 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
- 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97, 108,
- 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98,
- 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97,
- 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96,
- 100, 109, 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84,
- 97, 102, 111, 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84,
- 90, 104, 109, 118, 133, 135, 137, 136, 136, 137, 81, 74, 72, 71,
- 80, 85, 91, 105, 110, 120, 135, 137, 145, 143, 141, 138, 82, 75,
- 73, 72, 81, 86, 92, 106, 111, 121, 136, 139, 147, 148, 147, 149,
- 87, 79, 77, 76, 85, 90, 96, 110, 114, 125, 140, 143, 148, 154, 151,
- 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129, 135, 146, 153, 157,
- 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117, 128, 133, 150,
- 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105, 120, 125,
- 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103, 108,
- 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98,
- 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90,
- 92, 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97,
- 97, 93, 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187,
- // Size 32x16
- 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57,
- 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31,
- 32, 32, 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54,
- 60, 63, 65, 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32,
- 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62,
- 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34,
- 34, 35, 37, 37, 38, 40, 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70,
- 71, 72, 76, 78, 81, 83, 85, 88, 90, 93, 36, 35, 35, 34, 36, 37, 38,
- 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81,
- 85, 87, 88, 88, 89, 90, 92, 93, 39, 38, 38, 37, 39, 40, 40, 45, 47,
- 51, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78, 84, 85, 86, 90, 89,
- 90, 92, 95, 98, 101, 104, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54,
- 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99,
- 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49, 49, 54, 56,
- 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105, 106, 110,
- 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52, 57,
- 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110,
- 111, 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59,
- 58, 58, 63, 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111,
- 118, 120, 121, 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73,
- 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118,
- 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
- 81, 77, 75, 74, 72, 71, 70, 75, 77, 80, 85, 87, 91, 97, 99, 105,
- 110, 112, 119, 124, 127, 135, 137, 139, 143, 146, 150, 148, 144,
- 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97,
- 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153,
- 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78, 79, 83,
- 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136, 143,
- 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84,
- 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125,
- 131, 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91,
- 91, 87, 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110,
- 118, 119, 128, 129, 137, 138, 149, 149, 159, 160, 173, 174, 179,
- 180, 187,
- // Size 4x16
- 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34,
- 47, 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53,
- 68, 97, 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143,
- 79, 90, 125, 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131,
- 170,
- // Size 16x4
- 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39,
- 38, 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65,
- 60, 59, 58, 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131,
- 90, 84, 81, 78, 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164,
- 170,
- // Size 8x32
- 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31,
- 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33,
- 36, 42, 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38,
- 42, 52, 69, 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50,
- 59, 75, 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68,
- 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90,
- 97, 97, 47, 45, 56, 66, 79, 95, 99, 98, 49, 46, 57, 67, 81, 97,
- 104, 105, 53, 50, 60, 71, 86, 103, 109, 106, 57, 53, 63, 74, 90,
- 108, 111, 113, 59, 54, 64, 75, 91, 111, 119, 115, 65, 59, 68, 79,
- 97, 118, 123, 122, 69, 62, 71, 83, 100, 122, 127, 125, 71, 64, 73,
- 84, 102, 125, 135, 131, 79, 71, 79, 90, 109, 133, 137, 136, 81, 72,
- 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87,
- 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160,
- 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137,
- 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103,
- 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97,
- 93, 104, 118, 135, 155, 176,
- // Size 32x8
- 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57,
- 59, 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32,
- 32, 32, 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54,
- 59, 62, 64, 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35,
- 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71,
- 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42,
- 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90,
- 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 58, 55, 54, 53, 53,
- 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102,
- 109, 110, 111, 114, 113, 117, 120, 121, 120, 119, 118, 79, 75, 73,
- 72, 71, 70, 69, 73, 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118,
- 122, 125, 133, 135, 136, 140, 135, 133, 137, 141, 139, 137, 135,
- 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97, 99, 104, 109,
- 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153, 155, 160,
- 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92, 97,
- 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160,
- 163, 168, 169, 175, 175, 176},
- {// Chroma
- // Size 4x4
- 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
- // Size 8x8
- 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47,
- 47, 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49,
- 55, 65, 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63,
- 71, 81, 89, 97, 99, 67, 63, 64, 71, 79, 89, 99, 104,
- // Size 16x16
- 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30,
- 31, 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35,
- 39, 43, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43,
- 47, 47, 46, 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47,
- 50, 51, 51, 51, 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51,
- 54, 55, 56, 58, 60, 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55,
- 58, 60, 62, 63, 65, 68, 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60,
- 62, 65, 67, 69, 72, 73, 74, 73, 73, 54, 50, 49, 48, 53, 58, 62, 65,
- 70, 73, 75, 78, 79, 79, 77, 77, 57, 53, 51, 50, 54, 60, 63, 67, 73,
- 76, 79, 82, 84, 83, 82, 82, 60, 55, 53, 52, 56, 61, 65, 69, 75, 79,
- 82, 86, 88, 87, 86, 87, 64, 59, 57, 55, 59, 64, 68, 72, 78, 82, 86,
- 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65, 69, 73, 79, 84, 88, 93,
- 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74, 79, 83, 87, 92, 96,
- 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77, 82, 86, 91, 96,
- 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82, 87, 92, 96,
- 101, 104, 106,
- // Size 32x32
- 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54,
- 55, 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31,
- 31, 31, 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54,
- 56, 57, 60, 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31,
- 35, 37, 39, 42, 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55,
- 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37,
- 40, 42, 44, 46, 45, 45, 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58,
- 58, 60, 61, 62, 63, 63, 64, 65, 66, 33, 34, 35, 35, 39, 41, 43, 45,
- 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 53, 53, 56, 57, 57, 59,
- 60, 61, 62, 63, 64, 65, 66, 35, 36, 37, 37, 41, 43, 45, 46, 46, 47,
- 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53, 55, 56, 56, 58, 59, 60,
- 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45, 47, 47, 47, 48, 46, 46,
- 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55, 57, 58, 58, 59, 61,
- 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50, 49, 49, 49, 50,
- 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59, 60, 61, 61,
- 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51, 51, 52,
- 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62, 49,
- 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55,
- 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46,
- 45, 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61,
- 61, 63, 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46,
- 46, 46, 49, 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65,
- 65, 65, 66, 67, 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45,
- 49, 51, 53, 55, 56, 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68,
- 69, 70, 69, 69, 69, 70, 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51,
- 54, 56, 57, 59, 61, 62, 63, 64, 65, 66, 68, 68, 70, 71, 71, 72, 71,
- 71, 72, 71, 71, 71, 72, 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56,
- 57, 60, 62, 62, 64, 65, 66, 67, 69, 69, 71, 72, 72, 73, 74, 74, 72,
- 73, 74, 73, 73, 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61,
- 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 76, 75, 76, 76, 74,
- 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64, 65,
- 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78, 77, 78, 77, 77,
- 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65, 66, 68, 70,
- 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81, 57, 54,
- 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73, 76,
- 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53,
- 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81,
- 84, 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53,
- 52, 55, 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86,
- 86, 88, 88, 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57,
- 59, 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92,
- 89, 91, 89, 90, 89, 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61,
- 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92,
- 92, 91, 91, 92, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59, 61, 64, 65,
- 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91, 93, 94, 95, 94, 94,
- 94, 93, 94, 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72,
- 73, 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 95, 96, 97, 96, 95, 96,
- 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76,
- 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98, 99, 99, 97, 99, 68,
- 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71, 74, 75, 79, 80,
- 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101, 99, 69, 66,
- 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78, 80, 83,
- 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67,
- 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82,
- 85, 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67,
- 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83,
- 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68,
- 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82,
- 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104, 106, 106, 73, 69,
- 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81,
- 85, 85, 89, 90, 94, 94, 99, 99, 104, 104, 106, 106, 108,
- // Size 4x8
- 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50,
- 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99,
- // Size 8x4
- 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54,
- 50, 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99,
- // Size 8x16
- 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33,
- 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46,
- 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53,
- 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62,
- 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75,
- 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91,
- 94, 93, 68, 61, 63, 71, 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97,
- 100, 72, 65, 63, 69, 77, 86, 95, 102,
- // Size 16x8
- 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34,
- 36, 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46,
- 47, 47, 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46,
- 45, 51, 56, 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48,
- 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59,
- 64, 67, 71, 77, 82, 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64,
- 70, 73, 77, 83, 89, 93, 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68,
- 72, 76, 81, 85, 89, 93, 98, 100, 102,
- // Size 16x32
- 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31,
- 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32,
- 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36,
- 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43,
- 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47,
- 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46,
- 45, 47, 48, 50, 54, 55, 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49,
- 50, 51, 53, 57, 58, 59, 58, 59, 59, 44, 44, 46, 47, 51, 51, 51, 52,
- 53, 54, 59, 59, 60, 61, 61, 60, 49, 46, 47, 48, 53, 53, 53, 54, 55,
- 57, 60, 61, 63, 62, 62, 63, 48, 46, 46, 47, 53, 54, 56, 57, 58, 60,
- 64, 64, 64, 64, 64, 63, 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65,
- 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68,
- 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70,
- 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71,
- 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73,
- 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
- 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78, 57,
- 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54,
- 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53,
- 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55,
- 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61,
- 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65,
- 68, 75, 78, 83, 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69,
- 77, 79, 84, 91, 93, 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75,
- 78, 85, 88, 93, 96, 97, 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79,
- 84, 87, 94, 96, 97, 98, 96, 69, 63, 62, 60, 63, 65, 71, 72, 80, 82,
- 88, 93, 96, 99, 100, 101, 70, 64, 63, 60, 63, 66, 70, 73, 80, 81,
- 89, 90, 97, 99, 100, 101, 71, 65, 64, 61, 63, 67, 70, 74, 78, 82,
- 88, 90, 97, 99, 102, 103, 72, 65, 65, 62, 63, 68, 69, 75, 77, 83,
- 86, 92, 95, 100, 102, 103, 73, 66, 66, 63, 63, 69, 69, 76, 76, 84,
- 84, 93, 93, 101, 101, 105,
- // Size 32x16
- 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54,
- 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31,
- 32, 32, 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52,
- 54, 54, 57, 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36,
- 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53,
- 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45,
- 47, 47, 47, 48, 47, 46, 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55,
- 56, 57, 58, 59, 60, 60, 61, 62, 63, 48, 47, 46, 46, 47, 47, 47, 50,
- 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63,
- 63, 63, 63, 63, 63, 63, 63, 48, 47, 46, 45, 46, 46, 46, 50, 51, 53,
- 54, 55, 56, 56, 57, 57, 58, 59, 60, 61, 62, 64, 64, 65, 66, 65, 64,
- 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56,
- 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70,
- 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63,
- 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72, 73, 74, 75,
- 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68,
- 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 57,
- 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74,
- 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58,
- 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84,
- 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57,
- 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90,
- 90, 91, 93, 93, 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57,
- 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94,
- 94, 96, 96, 96, 97, 97, 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61,
- 62, 64, 66, 67, 71, 71, 75, 77, 79, 82, 83, 87, 88, 91, 93, 95, 97,
- 97, 99, 99, 99, 100, 101, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62,
- 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98,
- 100, 100, 102, 102, 101, 69, 66, 66, 63, 63, 61, 61, 59, 60, 63,
- 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86, 87, 91, 91, 95, 96,
- 101, 101, 103, 103, 105,
- // Size 4x16
- 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44,
- 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58,
- 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84,
- 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100,
- // Size 16x4
- 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48,
- 46, 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53,
- 51, 50, 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60,
- 58, 61, 64, 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
- // Size 8x32
- 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31,
- 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40,
- 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47,
- 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51,
- 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58,
- 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67,
- 70, 68, 50, 46, 54, 59, 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73,
- 72, 52, 47, 54, 61, 68, 75, 76, 73, 54, 49, 55, 62, 70, 77, 77, 76,
- 54, 49, 55, 62, 70, 78, 81, 77, 57, 51, 56, 64, 73, 82, 83, 81, 59,
- 52, 58, 65, 74, 84, 85, 82, 60, 53, 58, 65, 75, 85, 89, 85, 63, 56,
- 60, 67, 77, 89, 90, 87, 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61,
- 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70,
- 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80,
- 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78,
- 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76,
- 84, 93, 101,
- // Size 32x8
- 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54,
- 54, 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35,
- 36, 36, 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51,
- 52, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46,
- 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58,
- 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45,
- 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68,
- 68, 69, 70, 71, 71, 70, 70, 69, 69, 54, 51, 50, 49, 49, 48, 48, 51,
- 53, 55, 58, 59, 62, 65, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79,
- 78, 79, 80, 80, 78, 77, 76, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60,
- 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 91, 88, 87,
- 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66,
- 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97,
- 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68,
- 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102,
- 102, 101},
- },
- // Quantizer level 3.
- {
- {// Luma
- // Size 4x4
- 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
- // Size 8x8
- 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35,
- 36, 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51,
- 60, 73, 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78,
- 74, 80, 92, 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168,
- // Size 16x16
- 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31,
- 32, 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32,
- 33, 33, 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33,
- 35, 37, 38, 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37,
- 39, 42, 46, 50, 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42,
- 48, 52, 56, 60, 64, 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52,
- 57, 62, 67, 71, 75, 83, 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62,
- 69, 75, 79, 84, 91, 97, 100, 102, 104, 54, 50, 49, 49, 54, 60, 67,
- 75, 82, 87, 92, 100, 106, 110, 109, 112, 59, 55, 54, 53, 58, 64,
- 71, 79, 87, 92, 98, 106, 112, 117, 117, 121, 65, 61, 59, 58, 63,
- 68, 75, 84, 92, 98, 105, 114, 120, 125, 126, 130, 74, 69, 67, 65,
- 70, 75, 83, 91, 100, 106, 114, 123, 131, 135, 137, 140, 82, 76, 73,
- 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 148, 150, 87, 81,
- 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150, 155, 162,
- 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155, 162,
- 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162,
- 168, 174,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54,
- 56, 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31,
- 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53,
- 56, 61, 62, 68, 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32,
- 32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60,
- 61, 67, 69, 74, 76, 77, 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 49, 51, 54, 58, 59, 65,
- 68, 72, 74, 75, 79, 81, 84, 86, 88, 90, 31, 32, 32, 32, 33, 33, 33,
- 34, 35, 36, 36, 39, 40, 42, 44, 45, 49, 51, 54, 58, 59, 64, 67, 71,
- 73, 74, 78, 80, 83, 85, 88, 90, 31, 32, 32, 32, 33, 33, 34, 34, 35,
- 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74,
- 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38,
- 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69, 71, 72, 75, 78,
- 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43,
- 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78, 80, 82,
- 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50,
- 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86,
- 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59,
- 60, 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35,
- 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64,
- 67, 68, 73, 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37,
- 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73,
- 78, 80, 84, 86, 86, 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40,
- 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83,
- 86, 88, 89, 92, 93, 95, 97, 97, 98, 44, 42, 42, 41, 42, 42, 42, 44,
- 47, 52, 54, 58, 60, 63, 66, 67, 71, 73, 75, 79, 79, 84, 86, 90, 92,
- 92, 96, 98, 98, 98, 101, 104, 47, 45, 45, 44, 44, 45, 45, 47, 50,
- 55, 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97,
- 100, 99, 102, 105, 104, 104, 48, 46, 45, 44, 45, 45, 46, 47, 51,
- 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90, 93, 96, 98, 99,
- 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50, 49, 51, 54,
- 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 106,
- 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51, 53,
- 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106,
- 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53,
- 55, 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106,
- 110, 112, 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58,
- 58, 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103,
- 109, 112, 116, 118, 119, 122, 121, 125, 123, 125, 128, 65, 62, 61,
- 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79, 84, 85, 92, 94, 98,
- 103, 105, 111, 114, 118, 120, 121, 125, 129, 126, 129, 130, 129,
- 71, 68, 67, 65, 64, 64, 63, 65, 68, 72, 73, 78, 80, 84, 89, 90, 97,
- 100, 103, 109, 111, 117, 120, 125, 127, 128, 133, 130, 134, 133,
- 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83, 86,
- 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131, 131, 135, 137,
- 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74, 78, 79, 84,
- 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136, 137,
- 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76,
- 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136,
- 139, 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72,
- 73, 76, 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128,
- 131, 137, 139, 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79,
- 78, 78, 75, 77, 80, 83, 85, 90, 92, 96, 100, 102, 110, 112, 117,
- 122, 125, 133, 135, 142, 144, 145, 150, 151, 155, 158, 162, 158,
- 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98, 99, 106,
- 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156, 156,
- 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95,
- 98, 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152,
- 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83, 82, 85,
- 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138,
- 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88,
- 86, 86, 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125,
- 130, 133, 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100,
- 95, 95, 90, 90, 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111,
- 119, 119, 128, 129, 137, 137, 147, 148, 157, 158, 169, 170, 174,
- 175, 181,
- // Size 4x8
- 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51,
- 61, 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154,
- // Size 8x4
- 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59,
- 57, 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154,
- // Size 8x16
- 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31,
- 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35,
- 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49,
- 60, 67, 76, 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57,
- 71, 82, 92, 106, 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66,
- 79, 92, 105, 120, 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73,
- 79, 92, 105, 120, 139, 142, 87, 78, 83, 96, 110, 125, 144, 153, 92,
- 83, 84, 97, 114, 132, 150, 157, 97, 88, 86, 97, 111, 128, 147, 163,
- // Size 16x8
- 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31,
- 32, 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34,
- 35, 37, 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42,
- 42, 48, 54, 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49,
- 54, 60, 67, 75, 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59,
- 58, 63, 68, 76, 84, 92, 98, 105, 113, 120, 125, 132, 128, 82, 76,
- 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144, 150, 147, 90,
- 85, 81, 79, 81, 87, 93, 101, 108, 116, 124, 134, 142, 153, 157,
- 163,
- // Size 16x32
- 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31,
- 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32,
- 32, 32, 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32,
- 33, 34, 34, 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34,
- 35, 36, 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36,
- 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38,
- 42, 45, 49, 56, 58, 69, 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44,
- 47, 51, 58, 60, 71, 73, 76, 78, 80, 34, 34, 35, 37, 41, 42, 48, 50,
- 54, 61, 63, 73, 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59,
- 65, 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56, 60, 66,
- 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50, 58, 60, 65, 71, 73,
- 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51, 60, 62, 67, 74, 76, 86,
- 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92,
- 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
- 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96, 98, 104,
- 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106,
- 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106,
- 108, 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98,
- 110, 112, 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100,
- 103, 116, 118, 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92,
- 102, 105, 118, 120, 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89,
- 97, 108, 111, 125, 127, 129, 129, 130, 74, 68, 67, 66, 73, 75, 86,
- 91, 100, 110, 113, 128, 131, 135, 134, 130, 79, 72, 71, 70, 77, 79,
- 90, 95, 104, 115, 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79,
- 81, 92, 97, 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72,
- 79, 81, 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78,
- 76, 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89,
- 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
- 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153, 157,
- 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151, 154,
- 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147,
- 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144,
- 144, 163, 163, 173,
- // Size 32x16
- 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53,
- 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54,
- 58, 60, 65, 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32,
- 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59,
- 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34,
- 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66,
- 70, 72, 72, 76, 78, 80, 82, 85, 87, 35, 35, 34, 34, 35, 36, 37, 39,
- 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79,
- 79, 83, 83, 84, 85, 86, 87, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
- 48, 50, 51, 53, 56, 56, 60, 61, 63, 67, 68, 73, 75, 79, 81, 81, 85,
- 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
- 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97,
- 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
- 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99, 101, 104,
- 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71,
- 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114,
- 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74,
- 78, 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120,
- 118, 116, 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67,
- 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120,
- 121, 125, 128, 132, 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71,
- 73, 77, 78, 84, 86, 90, 95, 96, 103, 106, 110, 116, 118, 125, 128,
- 133, 136, 136, 141, 139, 135, 136, 140, 144, 82, 78, 76, 74, 73,
- 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118,
- 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 88, 83,
- 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97, 98, 104, 107, 111,
- 117, 119, 127, 129, 135, 140, 145, 148, 148, 153, 153, 154, 159,
- 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101,
- 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157,
- 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91,
- 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149,
- 150, 161, 162, 166, 167, 173,
- // Size 4x16
- 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34,
- 42, 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60,
- 90, 107, 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75,
- 81, 117, 145, 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159,
- // Size 16x4
- 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36,
- 35, 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58,
- 57, 56, 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88,
- 82, 79, 76, 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
- // Size 8x32
- 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31,
- 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33,
- 35, 42, 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37,
- 42, 49, 58, 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48,
- 54, 63, 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60,
- 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76,
- 88, 93, 44, 42, 51, 63, 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97,
- 101, 48, 45, 54, 67, 76, 85, 98, 101, 53, 50, 57, 71, 82, 92, 106,
- 108, 55, 51, 59, 72, 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112,
- 116, 63, 58, 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105,
- 120, 124, 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100,
- 113, 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
- 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78, 83,
- 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157, 92, 83,
- 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151, 163, 97,
- 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144, 163,
- // Size 32x8
- 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53,
- 55, 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32,
- 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54,
- 58, 59, 64, 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34,
- 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66,
- 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42,
- 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86,
- 90, 92, 92, 96, 97, 97, 97, 97, 97, 53, 51, 50, 49, 49, 50, 49, 51,
- 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104,
- 105, 106, 110, 113, 114, 112, 111, 110, 65, 62, 61, 59, 59, 59, 58,
- 60, 63, 67, 68, 73, 76, 79, 84, 85, 92, 94, 98, 103, 105, 111, 113,
- 118, 120, 121, 125, 128, 132, 130, 128, 126, 82, 78, 76, 74, 73,
- 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118,
- 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147, 144, 90, 86,
- 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101, 108, 110,
- 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163, 163,
- 163},
- {// Chroma
- // Size 4x4
- 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
- // Size 8x8
- 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46,
- 47, 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48,
- 54, 62, 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61,
- 68, 77, 86, 91, 95, 65, 61, 62, 68, 76, 86, 95, 100,
- // Size 16x16
- 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31,
- 31, 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34,
- 37, 42, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42,
- 47, 47, 48, 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47,
- 48, 50, 49, 50, 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50,
- 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53,
- 55, 57, 59, 60, 61, 64, 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57,
- 61, 63, 64, 66, 69, 70, 72, 71, 71, 52, 49, 47, 47, 50, 54, 59, 63,
- 66, 68, 70, 73, 75, 77, 75, 75, 54, 51, 49, 48, 52, 55, 60, 64, 68,
- 71, 73, 76, 79, 80, 79, 79, 57, 53, 51, 50, 53, 56, 61, 66, 70, 73,
- 76, 80, 82, 84, 83, 84, 61, 57, 55, 53, 56, 59, 64, 69, 73, 76, 80,
- 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61, 66, 70, 75, 79, 82, 87,
- 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72, 77, 80, 84, 89, 93,
- 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79, 83, 88, 93, 96,
- 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88, 93, 97,
- 100, 102,
- // Size 32x32
- 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52,
- 53, 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31,
- 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52,
- 54, 54, 57, 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31,
- 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
- 56, 57, 59, 60, 60, 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35,
- 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55,
- 57, 58, 58, 60, 61, 62, 62, 63, 64, 33, 34, 34, 34, 37, 38, 42, 43,
- 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 55, 56, 57,
- 57, 59, 60, 61, 62, 63, 64, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47,
- 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 53, 54, 56, 57, 57, 59,
- 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46,
- 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54, 55, 55, 57, 58, 59,
- 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47,
- 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58, 58, 59, 60,
- 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50,
- 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60, 47,
- 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54,
- 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47,
- 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56,
- 56, 58, 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46,
- 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61,
- 62, 63, 64, 64, 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46,
- 47, 49, 52, 53, 55, 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65,
- 66, 66, 67, 67, 67, 68, 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49,
- 52, 53, 55, 57, 58, 59, 60, 61, 62, 62, 63, 63, 65, 66, 67, 68, 68,
- 69, 70, 69, 68, 69, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54,
- 56, 57, 59, 61, 61, 63, 64, 64, 66, 66, 68, 69, 70, 70, 71, 72, 70,
- 71, 72, 71, 70, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58,
- 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71, 71, 71, 73, 74, 73, 72,
- 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63,
- 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75, 75, 76, 75, 74,
- 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62, 64, 64, 67,
- 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78, 54, 52,
- 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69, 71,
- 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51,
- 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76,
- 78, 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51,
- 50, 51, 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80,
- 82, 82, 83, 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53,
- 55, 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86,
- 86, 88, 86, 87, 86, 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58,
- 59, 62, 64, 66, 69, 69, 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89,
- 89, 88, 88, 88, 86, 63, 60, 59, 57, 56, 56, 54, 55, 57, 60, 60, 63,
- 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90, 92, 91, 91,
- 90, 89, 91, 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68,
- 70, 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 91, 93, 93, 93, 92, 93,
- 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71,
- 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94, 95, 95, 93, 95, 67,
- 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69, 72, 73, 77, 78,
- 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95, 67, 64, 63,
- 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78, 80, 81,
- 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62,
- 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83,
- 87, 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62,
- 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86,
- 88, 90, 92, 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62,
- 61, 60, 60, 63, 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88,
- 89, 93, 93, 97, 98, 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62,
- 60, 60, 64, 64, 67, 67, 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91,
- 91, 95, 95, 100, 100, 101, 101, 104,
- // Size 4x8
- 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49,
- 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95,
- // Size 8x4
- 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53,
- 50, 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95,
- // Size 8x16
- 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33,
- 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44,
- 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51,
- 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61,
- 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70,
- 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83,
- 91, 91, 66, 59, 61, 69, 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94,
- 97, 70, 63, 61, 67, 75, 83, 92, 98,
- // Size 16x8
- 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33,
- 34, 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45,
- 46, 47, 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45,
- 45, 49, 53, 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47,
- 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53,
- 57, 61, 66, 71, 73, 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61,
- 66, 71, 75, 79, 83, 87, 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67,
- 71, 74, 79, 83, 87, 91, 95, 97, 98,
- // Size 16x32
- 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
- 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
- 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
- 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
- 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
- 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
- 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
- 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
- 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
- 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
- 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
- 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
- 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
- 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
- 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
- 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
- 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
- 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
- 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
- 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
- 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
- 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
- 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
- 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
- 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
- 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
- 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
- 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
- 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
- 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
- 98, 102,
- // Size 32x16
- 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52,
- 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31,
- 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50,
- 51, 52, 54, 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35,
- 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51,
- 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43,
- 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53,
- 55, 56, 56, 57, 58, 59, 59, 60, 61, 45, 45, 45, 44, 46, 46, 47, 48,
- 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60,
- 60, 61, 61, 61, 61, 61, 61, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52,
- 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58, 59, 60, 61, 61, 63,
- 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
- 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68,
- 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
- 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70, 71, 72, 73,
- 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64,
- 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 56,
- 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71,
- 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53,
- 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76,
- 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56,
- 56, 54, 55, 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85,
- 86, 89, 90, 90, 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55,
- 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90,
- 91, 91, 93, 93, 94, 94, 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60,
- 61, 63, 65, 66, 70, 70, 73, 75, 77, 80, 81, 85, 86, 89, 91, 93, 94,
- 94, 96, 96, 95, 97, 98, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63,
- 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97,
- 97, 99, 98, 98, 68, 65, 65, 62, 62, 60, 61, 59, 59, 62, 62, 65, 65,
- 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89, 89, 93, 93, 97, 98, 99,
- 99, 102,
- // Size 4x16
- 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43,
- 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54,
- 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81,
- 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97,
- // Size 16x4
- 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48,
- 46, 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52,
- 50, 49, 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60,
- 57, 60, 63, 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
- // Size 8x32
- 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31,
- 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37,
- 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47,
- 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49,
- 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54,
- 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61,
- 66, 67, 49, 45, 51, 58, 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71,
- 71, 50, 46, 52, 59, 64, 67, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74,
- 53, 48, 53, 61, 67, 72, 77, 75, 54, 49, 54, 62, 68, 73, 79, 79, 56,
- 51, 55, 63, 70, 76, 82, 80, 57, 51, 55, 64, 70, 76, 83, 83, 60, 54,
- 57, 65, 72, 79, 86, 85, 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59,
- 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68,
- 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78,
- 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85,
- 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90,
- 98,
- // Size 32x8
- 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52,
- 53, 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34,
- 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49,
- 51, 51, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44,
- 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55,
- 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46,
- 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66,
- 67, 68, 68, 69, 69, 68, 68, 67, 67, 52, 50, 49, 48, 47, 47, 47, 48,
- 50, 53, 54, 57, 59, 61, 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75,
- 75, 77, 78, 77, 76, 75, 74, 57, 54, 53, 52, 51, 51, 50, 51, 53, 56,
- 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79, 80, 82, 83, 83, 84,
- 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64,
- 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94,
- 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67,
- 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98,
- 98},
- },
- // Quantizer level 4.
- {
- {// Luma
- // Size 4x4
- 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
- // Size 8x8
- 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34,
- 35, 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48,
- 53, 65, 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76,
- 71, 74, 86, 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153,
- // Size 16x16
- 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31,
- 32, 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32,
- 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33,
- 34, 36, 37, 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36,
- 38, 42, 44, 46, 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42,
- 48, 50, 54, 57, 60, 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50,
- 54, 58, 61, 65, 69, 74, 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58,
- 63, 67, 71, 75, 80, 86, 91, 95, 97, 48, 46, 45, 45, 50, 57, 61, 67,
- 71, 76, 80, 86, 93, 98, 101, 104, 54, 51, 49, 49, 53, 60, 65, 71,
- 76, 82, 87, 93, 100, 105, 109, 112, 59, 56, 54, 53, 57, 64, 69, 75,
- 80, 87, 92, 99, 106, 112, 116, 120, 66, 63, 60, 59, 63, 69, 74, 80,
- 86, 93, 99, 107, 115, 121, 125, 129, 74, 70, 67, 66, 69, 75, 80,
- 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77, 73, 71, 74, 80,
- 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81, 77, 75, 78,
- 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86, 82, 80,
- 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48,
- 52, 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32,
- 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51,
- 56, 56, 62, 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32,
- 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56,
- 61, 63, 67, 70, 75, 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 34, 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60,
- 65, 68, 72, 74, 75, 78, 80, 82, 84, 31, 32, 32, 32, 32, 33, 33, 33,
- 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67,
- 72, 73, 74, 77, 80, 82, 84, 31, 32, 32, 32, 33, 33, 33, 34, 35, 35,
- 36, 36, 39, 40, 42, 44, 45, 48, 50, 53, 54, 59, 60, 64, 67, 71, 73,
- 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 37, 38,
- 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63, 66, 70, 71, 72, 75,
- 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37, 38, 38, 40, 41,
- 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72, 74, 77, 79,
- 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48,
- 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81, 34,
- 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53,
- 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35,
- 34, 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63,
- 64, 68, 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35,
- 36, 38, 38, 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68,
- 69, 73, 75, 79, 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40,
- 40, 44, 45, 50, 51, 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78,
- 80, 84, 85, 86, 89, 91, 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45,
- 46, 51, 52, 55, 56, 59, 61, 63, 65, 67, 70, 70, 75, 76, 80, 82, 86,
- 87, 88, 91, 92, 94, 96, 44, 42, 42, 41, 41, 42, 42, 42, 46, 47, 54,
- 54, 58, 59, 63, 65, 67, 70, 71, 75, 75, 79, 80, 84, 86, 90, 91, 92,
- 95, 97, 97, 97, 46, 44, 44, 42, 43, 44, 44, 44, 48, 49, 55, 55, 59,
- 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87, 89, 93, 94, 95, 98, 98,
- 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50, 51, 57, 57, 61, 63,
- 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98, 99, 101, 104,
- 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65,
- 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106,
- 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65,
- 67, 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106,
- 109, 112, 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64,
- 68, 70, 75, 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111,
- 112, 115, 114, 115, 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58,
- 64, 64, 69, 70, 75, 78, 80, 85, 87, 91, 92, 98, 99, 103, 106, 110,
- 112, 113, 116, 119, 120, 119, 65, 62, 61, 59, 59, 59, 58, 58, 62,
- 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 114,
- 118, 120, 121, 124, 123, 123, 126, 66, 63, 63, 60, 60, 60, 59, 59,
- 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112,
- 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67, 65, 65, 64, 63,
- 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103, 103, 111, 112,
- 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70, 68, 67,
- 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106,
- 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75,
- 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104,
- 110, 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144,
- 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98,
- 103, 105, 111, 112, 120, 121, 127, 130, 136, 137, 139, 142, 145,
- 148, 144, 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
- 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
- 144, 148, 150, 155, 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84,
- 89, 91, 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140,
- 142, 144, 147, 149, 153, 155, 89, 84, 84, 80, 80, 79, 78, 77, 79,
- 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132,
- 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86, 82, 82, 81, 80,
- 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123,
- 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88, 84,
- 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110,
- 118, 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
- // Size 4x8
- 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48,
- 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144,
- // Size 8x4
- 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51,
- 50, 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144,
- // Size 8x16
- 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31,
- 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34,
- 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44,
- 54, 63, 73, 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62,
- 74, 85, 96, 103, 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70,
- 84, 98, 110, 118, 66, 60, 63, 75, 90, 106, 119, 126, 74, 67, 69,
- 81, 97, 113, 128, 134, 81, 73, 75, 86, 102, 120, 135, 143, 86, 78,
- 78, 90, 106, 124, 140, 147, 91, 82, 80, 90, 103, 119, 137, 151,
- // Size 16x8
- 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31,
- 32, 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33,
- 34, 36, 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38,
- 40, 44, 51, 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48,
- 52, 58, 63, 69, 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58,
- 62, 68, 73, 79, 85, 92, 98, 106, 113, 120, 124, 119, 79, 74, 71,
- 69, 72, 78, 84, 90, 96, 103, 110, 119, 128, 135, 140, 137, 87, 82,
- 79, 77, 78, 84, 89, 96, 103, 111, 118, 126, 134, 143, 147, 151,
- // Size 16x32
- 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31,
- 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32,
- 32, 32, 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32,
- 33, 33, 34, 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33,
- 34, 35, 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35,
- 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37,
- 40, 42, 48, 49, 58, 59, 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41,
- 42, 48, 49, 58, 59, 69, 70, 75, 77, 33, 33, 34, 36, 38, 41, 44, 46,
- 52, 53, 62, 63, 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53,
- 54, 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54, 58, 60,
- 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48, 51, 54, 59, 60, 68,
- 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50, 54, 58, 63, 65, 73, 74,
- 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85,
- 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
- 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83, 93, 94, 97,
- 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98, 103,
- 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105,
- 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111,
- 107, 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113,
- 115, 58, 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118,
- 115, 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
- 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119,
- 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97, 111, 112,
- 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86, 97, 100, 113,
- 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79, 85, 90, 101, 104,
- 118, 119, 133, 135, 141, 140, 81, 74, 73, 71, 75, 80, 86, 91, 102,
- 105, 120, 121, 135, 137, 143, 140, 82, 75, 74, 72, 75, 81, 87, 92,
- 103, 106, 121, 122, 136, 139, 147, 151, 86, 78, 78, 75, 78, 84, 90,
- 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81, 80, 77, 80, 86,
- 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83, 82, 79, 80,
- 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85, 85,
- 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160,
- // Size 32x16
- 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48,
- 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49,
- 53, 54, 60, 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32,
- 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54,
- 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34,
- 35, 35, 36, 37, 38, 38, 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59,
- 63, 66, 70, 71, 72, 75, 77, 79, 81, 33, 33, 33, 33, 34, 35, 36, 36,
- 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69,
- 73, 75, 75, 78, 80, 80, 81, 36, 35, 35, 34, 35, 36, 37, 38, 41, 42,
- 48, 48, 50, 51, 53, 55, 56, 59, 60, 63, 63, 68, 69, 73, 75, 79, 80,
- 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51,
- 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90,
- 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48, 54, 54, 58, 59,
- 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92, 95, 98,
- 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65,
- 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106,
- 105, 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65,
- 67, 71, 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106,
- 109, 112, 114, 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68,
- 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111, 113, 118, 120,
- 121, 124, 122, 119, 117, 66, 63, 62, 60, 60, 60, 59, 59, 63, 64,
- 69, 69, 74, 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 112, 115,
- 119, 121, 122, 125, 127, 130, 134, 79, 75, 74, 72, 71, 71, 69, 69,
- 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118, 119,
- 125, 128, 133, 135, 136, 140, 140, 137, 134, 81, 77, 76, 74, 73,
- 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111, 112,
- 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151, 87, 83, 82,
- 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111,
- 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152,
- 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100,
- 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151,
- 155, 155, 160,
- // Size 4x16
- 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33,
- 41, 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56,
- 76, 98, 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75,
- 100, 130, 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148,
- // Size 16x4
- 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36,
- 35, 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51,
- 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76,
- 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
- // Size 8x32
- 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31,
- 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32,
- 34, 38, 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36,
- 40, 48, 58, 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44,
- 52, 62, 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58,
- 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73,
- 84, 89, 40, 39, 45, 56, 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90,
- 96, 46, 43, 48, 60, 72, 82, 93, 97, 48, 45, 50, 62, 74, 85, 96,
- 103, 52, 48, 52, 65, 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103,
- 111, 58, 53, 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110,
- 118, 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
- 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81, 97,
- 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73, 75, 86,
- 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86, 78, 78,
- 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91, 82,
- 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152,
- // Size 32x8
- 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48,
- 52, 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32,
- 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49,
- 53, 54, 59, 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33,
- 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57,
- 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39,
- 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75,
- 79, 81, 85, 86, 87, 90, 90, 90, 90, 51, 49, 49, 47, 47, 48, 48, 48,
- 52, 53, 58, 59, 63, 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97,
- 101, 102, 103, 106, 105, 103, 103, 65, 62, 61, 59, 59, 59, 58, 58,
- 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98, 105, 106, 111,
- 113, 118, 120, 121, 124, 122, 119, 117, 79, 75, 74, 72, 71, 71, 69,
- 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101, 103, 109, 110, 118,
- 119, 125, 128, 133, 135, 136, 140, 140, 137, 134, 87, 83, 82, 79,
- 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103, 105, 111, 113,
- 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151, 152},
- {// Chroma
- // Size 4x4
- 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
- // Size 8x8
- 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42,
- 45, 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47,
- 50, 58, 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57,
- 65, 73, 82, 89, 92, 64, 59, 60, 66, 74, 83, 92, 96,
- // Size 16x16
- 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31,
- 31, 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32,
- 34, 39, 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39,
- 44, 46, 47, 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46,
- 48, 50, 49, 48, 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50,
- 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53,
- 54, 55, 56, 57, 58, 60, 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55,
- 58, 60, 61, 62, 64, 66, 68, 69, 69, 50, 48, 46, 46, 49, 54, 56, 60,
- 61, 63, 65, 67, 69, 71, 72, 72, 52, 49, 47, 47, 49, 54, 57, 61, 63,
- 66, 68, 71, 73, 75, 76, 77, 54, 51, 49, 48, 51, 55, 58, 62, 65, 68,
- 71, 74, 76, 78, 80, 81, 57, 54, 52, 51, 53, 57, 60, 64, 67, 71, 74,
- 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59, 62, 66, 69, 73, 76, 80,
- 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68, 71, 75, 78, 83, 87,
- 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76, 80, 84, 89, 92,
- 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85, 89, 94, 96,
- 98,
- // Size 32x32
- 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50,
- 52, 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31,
- 31, 31, 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50,
- 52, 52, 54, 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31,
- 32, 35, 37, 39, 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51,
- 54, 54, 56, 57, 59, 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35,
- 38, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52,
- 54, 55, 57, 58, 58, 60, 61, 61, 62, 31, 32, 32, 33, 34, 37, 39, 41,
- 43, 43, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 54, 55,
- 57, 57, 58, 59, 60, 61, 62, 33, 34, 35, 35, 37, 39, 41, 43, 44, 45,
- 47, 47, 46, 46, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 54, 56, 57,
- 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41, 44, 46, 46, 46, 47, 47,
- 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52, 53, 55, 56, 56, 57,
- 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47, 48, 47, 46, 46,
- 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55, 56, 57, 58,
- 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49, 48, 49,
- 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59, 41,
- 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50,
- 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47,
- 46, 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
- 55, 56, 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46,
- 47, 47, 47, 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57,
- 57, 59, 59, 61, 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46,
- 46, 49, 49, 53, 53, 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61,
- 62, 63, 64, 64, 65, 66, 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49,
- 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 60, 61, 61, 63, 63, 65,
- 65, 65, 66, 66, 67, 68, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53,
- 54, 55, 56, 58, 59, 60, 61, 61, 62, 62, 63, 64, 65, 66, 67, 68, 68,
- 69, 70, 69, 68, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56,
- 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67, 68, 69, 69, 70, 70, 70,
- 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60,
- 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71, 72, 73, 72, 71,
- 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
- 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75, 52, 50,
- 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65, 66,
- 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49,
- 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70,
- 73, 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49,
- 48, 48, 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74,
- 75, 76, 78, 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50,
- 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80,
- 82, 82, 83, 84, 83, 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53,
- 57, 57, 60, 61, 64, 65, 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83,
- 83, 84, 85, 85, 83, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59,
- 61, 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 83, 85, 86, 86, 87,
- 87, 86, 87, 61, 58, 57, 55, 55, 54, 53, 53, 55, 56, 59, 59, 62, 63,
- 66, 68, 69, 72, 73, 76, 76, 80, 80, 83, 84, 86, 87, 88, 89, 89, 89,
- 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67, 69,
- 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90, 91, 92, 90, 91, 64,
- 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74,
- 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91, 65, 61, 61,
- 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74, 75, 78,
- 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60, 59,
- 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84,
- 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58,
- 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
- 89, 92, 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59,
- 61, 62, 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90,
- 94, 94, 96, 96, 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62,
- 65, 65, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96,
- 96, 97, 97, 99,
- // Size 4x8
- 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47,
- 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93,
- // Size 8x4
- 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50,
- 47, 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93,
- // Size 8x16
- 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31,
- 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43,
- 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49,
- 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57,
- 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67,
- 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80,
- 86, 88, 64, 58, 58, 65, 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91,
- 94, 68, 61, 59, 65, 72, 81, 89, 95,
- // Size 16x8
- 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32,
- 33, 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41,
- 43, 46, 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46,
- 46, 49, 53, 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47,
- 49, 54, 57, 61, 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52,
- 57, 60, 64, 67, 71, 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60,
- 64, 67, 71, 75, 78, 82, 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65,
- 69, 73, 77, 81, 85, 88, 92, 94, 95,
- // Size 16x32
- 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
- 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
- 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
- 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
- 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
- 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
- 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
- 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
- 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
- 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
- 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
- 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
- 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
- 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
- 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
- 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
- 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
- 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
- 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
- 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
- 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
- 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
- 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
- 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
- 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
- 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
- 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
- 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
- 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
- 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
- 95, 98,
- // Size 32x16
- 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50,
- 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31,
- 31, 32, 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48,
- 50, 50, 52, 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33,
- 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50,
- 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43,
- 45, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51,
- 52, 53, 55, 55, 56, 57, 58, 58, 59, 40, 41, 41, 42, 43, 44, 46, 47,
- 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56,
- 57, 58, 58, 59, 59, 59, 59, 48, 47, 47, 46, 46, 47, 47, 47, 49, 50,
- 53, 53, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 57, 58, 59, 60, 61,
- 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
- 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66,
- 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49, 53, 54, 56, 56,
- 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68, 69, 70, 71,
- 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62,
- 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 52,
- 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66,
- 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54,
- 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73,
- 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52,
- 52, 51, 50, 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77,
- 77, 79, 81, 82, 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55,
- 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85,
- 86, 89, 89, 90, 91, 91, 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57,
- 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 90,
- 90, 91, 92, 93, 94, 95, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62,
- 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94,
- 94, 96, 95, 95, 67, 64, 64, 61, 61, 60, 60, 58, 58, 61, 61, 64, 64,
- 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86, 86, 90, 90, 95, 95, 96,
- 96, 98,
- // Size 4x16
- 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42,
- 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54,
- 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73,
- 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94,
- // Size 16x4
- 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48,
- 47, 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49,
- 48, 47, 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57,
- 56, 57, 61, 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
- // Size 8x32
- 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31,
- 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35,
- 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46,
- 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49,
- 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54,
- 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60,
- 64, 65, 48, 46, 49, 55, 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67,
- 69, 49, 46, 49, 57, 62, 65, 69, 70, 50, 46, 49, 57, 63, 67, 71, 73,
- 51, 47, 49, 58, 64, 69, 73, 74, 52, 48, 50, 58, 65, 71, 75, 77, 54,
- 49, 51, 59, 67, 73, 77, 78, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52,
- 52, 60, 69, 76, 82, 83, 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55,
- 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64,
- 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74,
- 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82,
- 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87,
- 95,
- // Size 32x8
- 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50,
- 51, 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33,
- 33, 33, 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48,
- 49, 50, 52, 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42,
- 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51,
- 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46,
- 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61,
- 62, 63, 64, 65, 65, 66, 66, 65, 65, 51, 49, 49, 47, 47, 47, 47, 46,
- 49, 50, 54, 54, 57, 58, 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72,
- 73, 73, 74, 75, 74, 72, 71, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53,
- 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76, 77, 79, 80, 82, 82,
- 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61,
- 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91,
- 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66,
- 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95,
- 95},
- },
- // Quantizer level 5.
- {
- {// Luma
- // Size 4x4
- 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
- // Size 8x8
- 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34,
- 35, 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45,
- 51, 61, 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64,
- 68, 78, 90, 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140,
- // Size 16x16
- 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31,
- 32, 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32,
- 32, 32, 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32,
- 33, 34, 35, 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34,
- 35, 37, 38, 40, 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37,
- 39, 42, 45, 47, 51, 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42,
- 48, 50, 54, 57, 60, 64, 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50,
- 54, 58, 61, 65, 69, 73, 78, 84, 86, 44, 42, 41, 42, 42, 47, 54, 58,
- 63, 67, 71, 75, 79, 84, 90, 92, 48, 46, 44, 45, 46, 51, 57, 61, 67,
- 71, 76, 80, 85, 90, 96, 99, 54, 51, 49, 50, 49, 54, 60, 65, 71, 76,
- 82, 87, 92, 97, 104, 106, 59, 56, 54, 54, 53, 58, 64, 69, 75, 80,
- 87, 92, 98, 103, 110, 113, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85,
- 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63, 68, 73, 78, 84, 90,
- 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74, 79, 84, 90,
- 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76, 81, 86,
- 92, 99, 106, 113, 121, 128, 137, 140,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
- 48, 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46,
- 51, 51, 56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51,
- 56, 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54,
- 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 34, 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59,
- 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35,
- 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64,
- 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36,
- 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59, 59, 64, 64, 71, 71,
- 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
- 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72,
- 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42,
- 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75, 34,
- 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51,
- 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34,
- 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54,
- 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34,
- 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64,
- 64, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36,
- 38, 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68,
- 68, 73, 73, 79, 79, 81, 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40,
- 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78,
- 78, 84, 84, 86, 86, 90, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45,
- 50, 50, 54, 54, 58, 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84,
- 84, 86, 86, 90, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54,
- 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92,
- 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58,
- 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
- 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67,
- 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 48,
- 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
- 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51,
- 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76,
- 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51,
- 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76,
- 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 59, 56,
- 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80,
- 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 59, 56,
- 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80,
- 87, 87, 92, 92, 98, 98, 103, 103, 110, 110, 113, 113, 116, 65, 62,
- 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85,
- 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124, 65,
- 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
- 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
- 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84,
- 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128,
- 132, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78,
- 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125,
- 128, 128, 132, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79,
- 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125,
- 134, 134, 137, 137, 141, 80, 76, 76, 72, 72, 71, 71, 69, 69, 74,
- 74, 79, 79, 84, 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118,
- 125, 125, 134, 134, 137, 137, 141, 83, 78, 78, 75, 75, 74, 74, 72,
- 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113,
- 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78, 75, 75,
- 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106, 106,
- 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83,
- 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102,
- 109, 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
- // Size 4x8
- 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45,
- 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136,
- // Size 8x4
- 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51,
- 50, 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136,
- // Size 8x16
- 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31,
- 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33,
- 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38,
- 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53,
- 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71,
- 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79,
- 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90,
- 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
- // Size 16x8
- 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31,
- 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32,
- 33, 34, 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34,
- 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42,
- 42, 48, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49,
- 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58,
- 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 79, 75, 72, 71, 69,
- 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
- // Size 16x32
- 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31,
- 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32,
- 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32,
- 33, 33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33,
- 33, 34, 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34,
- 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36,
- 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38,
- 42, 42, 49, 49, 58, 58, 69, 69, 75, 32, 33, 33, 35, 35, 38, 38, 42,
- 42, 49, 49, 58, 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48,
- 54, 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
- 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60,
- 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48, 48, 54, 54, 60, 60, 68,
- 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73,
- 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
- 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90,
- 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90, 90, 95,
- 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
- 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
- 53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
- 53, 49, 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
- 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116,
- 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116,
- 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118, 118,
- 124, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92, 92, 105, 105, 118,
- 118, 124, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111, 111,
- 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84, 97, 97, 111,
- 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104,
- 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90, 104,
- 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92,
- 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92,
- 92, 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84,
- 96, 96, 109, 109, 124, 124, 141, 141, 149,
- // Size 32x16
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
- 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45,
- 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49,
- 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34,
- 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54,
- 58, 58, 63, 63, 70, 70, 72, 72, 76, 32, 32, 32, 33, 33, 34, 34, 35,
- 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58,
- 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42,
- 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73,
- 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
- 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79,
- 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
- 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92,
- 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
- 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53,
- 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
- 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53,
- 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
- 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65,
- 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
- 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121, 124,
- 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79,
- 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121,
- 124, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78, 84, 84,
- 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125, 133, 133,
- 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69, 73, 73, 78, 78,
- 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118, 125, 125,
- 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75, 79,
- 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124,
- 132, 132, 141, 141, 144, 144, 149,
- // Size 4x16
- 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33,
- 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53,
- 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68,
- 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136,
- // Size 16x4
- 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36,
- 35, 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51,
- 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75,
- 72, 71, 69, 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
- // Size 8x32
- 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31,
- 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32,
- 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34,
- 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38,
- 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48,
- 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60,
- 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73,
- 84, 44, 41, 43, 53, 63, 71, 79, 90, 44, 41, 43, 53, 63, 71, 79, 90,
- 48, 45, 46, 56, 67, 76, 85, 96, 48, 45, 46, 56, 67, 76, 85, 96, 53,
- 49, 50, 60, 71, 82, 92, 103, 53, 49, 50, 60, 71, 82, 92, 103, 58,
- 54, 54, 63, 75, 87, 98, 110, 58, 54, 54, 63, 75, 87, 98, 110, 65,
- 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68, 79, 92, 105, 118, 71,
- 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84, 97, 111, 125, 79,
- 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90, 104, 118, 133,
- 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92, 106, 121,
- 136, 87, 79, 76, 84, 96, 109, 124, 141,
- // Size 32x8
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44,
- 48, 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45,
- 49, 49, 54, 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33,
- 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50,
- 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36,
- 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63,
- 68, 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42,
- 42, 48, 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79,
- 84, 84, 90, 90, 92, 92, 96, 53, 51, 51, 49, 49, 50, 50, 49, 49, 54,
- 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92, 97, 97,
- 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63,
- 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111,
- 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69,
- 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
- 118, 125, 125, 133, 133, 136, 136, 141},
- {// Chroma
- // Size 4x4
- 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
- // Size 8x8
- 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42,
- 45, 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46,
- 50, 56, 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55,
- 61, 68, 75, 82, 86, 61, 57, 58, 64, 71, 79, 86, 91,
- // Size 16x16
- 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31,
- 31, 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31,
- 32, 35, 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35,
- 39, 43, 45, 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43,
- 47, 47, 48, 46, 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47,
- 48, 50, 49, 49, 50, 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50,
- 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53,
- 54, 55, 56, 57, 58, 60, 61, 63, 64, 49, 47, 45, 45, 45, 49, 53, 55,
- 58, 60, 61, 62, 63, 65, 67, 68, 50, 48, 46, 46, 46, 50, 54, 56, 60,
- 61, 63, 65, 67, 68, 71, 71, 52, 50, 47, 47, 47, 50, 54, 57, 61, 63,
- 66, 68, 70, 72, 75, 75, 54, 52, 49, 49, 48, 52, 55, 58, 62, 65, 68,
- 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53, 56, 60, 63, 67, 70, 73,
- 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61, 65, 68, 72, 75, 79,
- 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71, 75, 78, 82, 85,
- 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79, 83, 86, 90,
- 91,
- // Size 32x32
- 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49,
- 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31,
- 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48,
- 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31,
- 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50,
- 52, 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35,
- 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49,
- 52, 52, 54, 54, 57, 57, 58, 58, 60, 30, 31, 31, 32, 32, 35, 35, 40,
- 40, 42, 42, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52,
- 54, 54, 57, 57, 58, 58, 60, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45,
- 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53,
- 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47,
- 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51, 51, 53, 53, 56, 56,
- 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
- 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55,
- 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45,
- 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57, 41,
- 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50,
- 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42,
- 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50,
- 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46,
- 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
- 55, 56, 56, 58, 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47,
- 48, 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
- 56, 58, 58, 60, 60, 61, 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46,
- 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61,
- 61, 63, 63, 64, 64, 66, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
- 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63,
- 63, 64, 64, 66, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53,
- 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68,
- 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55,
- 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67, 67, 68, 68, 69,
- 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
- 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 50, 48,
- 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61, 61,
- 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47,
- 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66,
- 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47,
- 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68,
- 70, 70, 72, 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48,
- 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73,
- 75, 75, 78, 78, 79, 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52,
- 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75,
- 78, 78, 79, 79, 80, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56,
- 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82,
- 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60,
- 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83,
- 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65,
- 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 60,
- 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68,
- 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88, 63, 60, 60,
- 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75,
- 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57,
- 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78,
- 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57,
- 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
- 83, 86, 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55,
- 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86,
- 86, 90, 90, 91, 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60,
- 62, 62, 66, 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92,
- 92, 93, 93, 95,
- // Size 4x8
- 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46,
- 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90,
- // Size 8x4
- 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50,
- 47, 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90,
- // Size 8x16
- 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30,
- 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40,
- 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48,
- 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53,
- 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61,
- 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70,
- 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82,
- 89, 64, 59, 56, 61, 68, 75, 83, 90,
- // Size 16x8
- 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31,
- 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38,
- 40, 43, 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46,
- 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46,
- 45, 49, 53, 56, 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47,
- 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53,
- 57, 60, 64, 67, 71, 73, 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60,
- 64, 67, 71, 75, 78, 82, 85, 89, 90,
- // Size 16x32
- 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
- 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
- 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
- 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
- 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
- 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
- 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
- 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
- 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
- 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
- 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
- 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
- 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
- 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
- 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
- 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
- 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
- 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
- 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
- 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
- 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
- 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
- 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
- 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
- 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
- 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
- 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
- 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
- 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
- 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
- 92, 95,
- // Size 32x16
- 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49,
- 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31,
- 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46,
- 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32,
- 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48,
- 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43,
- 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49,
- 50, 50, 52, 52, 55, 55, 56, 56, 57, 37, 38, 38, 40, 40, 43, 43, 47,
- 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50,
- 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50,
- 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58,
- 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
- 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60,
- 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
- 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68,
- 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
- 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52,
- 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64,
- 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50,
- 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66,
- 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52,
- 51, 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73,
- 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51,
- 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76,
- 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54,
- 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85,
- 85, 89, 89, 90, 90, 92, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57,
- 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
- 89, 90, 90, 92, 66, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62,
- 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84, 84, 88, 88, 92, 92, 93,
- 93, 95,
- // Size 4x16
- 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40,
- 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53,
- 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70,
- 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90,
- // Size 16x4
- 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48,
- 47, 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50,
- 48, 47, 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57,
- 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
- // Size 8x32
- 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31,
- 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32,
- 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43,
- 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47,
- 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49,
- 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54,
- 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60,
- 64, 49, 45, 46, 53, 58, 61, 64, 67, 49, 45, 46, 53, 58, 61, 64, 67,
- 50, 46, 46, 54, 59, 64, 67, 71, 50, 46, 46, 54, 59, 64, 67, 71, 52,
- 48, 47, 54, 61, 66, 71, 75, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50,
- 49, 55, 62, 68, 73, 78, 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50,
- 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58,
- 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67,
- 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75,
- 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84,
- 92,
- // Size 32x8
- 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49,
- 50, 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31,
- 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46,
- 48, 48, 50, 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40,
- 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47,
- 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47,
- 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55,
- 56, 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45,
- 45, 49, 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64,
- 65, 65, 67, 67, 68, 68, 69, 52, 50, 50, 48, 48, 47, 47, 47, 47, 50,
- 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68, 68, 70, 70, 72, 72,
- 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57,
- 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82,
- 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64,
- 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90,
- 92},
- },
- // Quantizer level 6.
- {
- {// Luma
- // Size 4x4
- 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
- // Size 8x8
- 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32,
- 33, 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41,
- 43, 54, 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56,
- 66, 77, 89, 98, 108, 69, 65, 64, 73, 85, 97, 108, 119,
- // Size 16x16
- 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31,
- 32, 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32,
- 32, 32, 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32,
- 33, 34, 35, 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34,
- 35, 37, 38, 39, 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37,
- 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42,
- 46, 48, 50, 53, 55, 59, 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48,
- 51, 54, 57, 59, 63, 67, 71, 76, 82, 41, 40, 38, 40, 41, 46, 50, 54,
- 57, 60, 63, 67, 71, 75, 80, 86, 45, 43, 41, 42, 43, 48, 53, 57, 60,
- 65, 68, 72, 76, 81, 85, 91, 48, 46, 44, 45, 46, 51, 55, 59, 63, 68,
- 71, 76, 80, 85, 90, 96, 54, 51, 49, 50, 49, 54, 59, 63, 67, 72, 76,
- 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58, 62, 67, 71, 76, 80,
- 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67, 71, 75, 81, 85,
- 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76, 80, 85, 90,
- 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86, 91, 96,
- 104, 110, 118, 125, 134,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
- 44, 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43,
- 46, 46, 51, 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46,
- 51, 51, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50,
- 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 34, 34, 36, 37, 38, 41, 41, 44, 44, 49, 49, 52, 54,
- 56, 59, 61, 65, 65, 72, 72, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 34, 34, 35, 35, 37, 38, 39, 41, 42, 45, 45, 49, 49, 52, 54, 56, 59,
- 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35,
- 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52, 54, 56, 59, 60, 64,
- 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37,
- 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60, 64, 64, 70,
- 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38, 39, 40,
- 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69, 32,
- 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43,
- 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34,
- 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51,
- 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33,
- 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54,
- 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36,
- 36, 38, 38, 42, 42, 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61,
- 62, 64, 67, 68, 72, 72, 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38,
- 38, 42, 42, 47, 48, 50, 50, 52, 54, 54, 57, 57, 59, 60, 62, 64, 66,
- 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44,
- 44, 48, 50, 51, 52, 54, 56, 57, 59, 59, 62, 63, 65, 67, 69, 71, 72,
- 76, 76, 82, 82, 39, 38, 38, 38, 37, 38, 39, 39, 40, 41, 45, 45, 49,
- 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67, 69, 71, 73, 74, 78, 78,
- 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54,
- 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77, 80, 81, 86, 86,
- 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56, 58, 60,
- 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 45, 43,
- 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64, 65,
- 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45,
- 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71,
- 75, 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45,
- 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76,
- 79, 80, 83, 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49,
- 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86,
- 89, 91, 93, 97, 97, 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49,
- 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89,
- 92, 94, 97, 98, 104, 104, 57, 55, 55, 53, 52, 52, 52, 52, 52, 52,
- 57, 57, 61, 62, 65, 67, 70, 74, 75, 79, 79, 85, 85, 89, 90, 93, 96,
- 98, 102, 102, 108, 108, 59, 56, 56, 54, 54, 54, 54, 54, 53, 54, 58,
- 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86, 87, 90, 92, 95, 98, 99,
- 103, 104, 110, 110, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60, 60,
- 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95, 98, 101, 103,
- 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63,
- 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106,
- 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64,
- 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
- 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68,
- 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111,
- 113, 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64,
- 68, 68, 72, 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108,
- 111, 113, 118, 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69,
- 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
- 114, 118, 120, 125, 126, 134, 134, 80, 76, 76, 73, 72, 72, 71, 70,
- 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108,
- 110, 114, 118, 120, 125, 126, 134, 134,
- // Size 4x8
- 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41,
- 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111,
- // Size 8x4
- 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43,
- 42, 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111,
- // Size 8x16
- 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31,
- 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33,
- 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38,
- 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51,
- 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67,
- 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87,
- 95, 105, 65, 60, 58, 68, 79, 92, 102, 112, 71, 65, 63, 73, 84, 97,
- 108, 119, 79, 72, 70, 79, 90, 104, 115, 127,
- // Size 16x8
- 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31,
- 32, 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32,
- 33, 34, 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34,
- 36, 38, 42, 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42,
- 42, 48, 52, 56, 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49,
- 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56,
- 61, 65, 69, 74, 79, 83, 90, 95, 102, 108, 115, 73, 69, 66, 65, 64,
- 69, 73, 77, 81, 86, 91, 99, 105, 112, 119, 127,
- // Size 16x32
- 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31,
- 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32,
- 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32,
- 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32,
- 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33,
- 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35,
- 36, 38, 42, 43, 50, 50, 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37,
- 38, 42, 43, 49, 49, 56, 59, 65, 70, 32, 32, 33, 34, 35, 37, 38, 39,
- 42, 43, 49, 49, 56, 58, 64, 69, 32, 33, 33, 34, 35, 37, 39, 40, 43,
- 44, 50, 50, 56, 58, 64, 69, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48,
- 54, 54, 61, 63, 69, 73, 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54,
- 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59,
- 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66,
- 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71,
- 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79,
- 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
- 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90, 44,
- 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45,
- 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45,
- 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49,
- 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50,
- 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52,
- 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108, 58, 54, 54, 54,
- 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110, 61, 57, 57, 56,
- 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114, 65, 60, 60, 59,
- 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118, 67, 62, 61, 60,
- 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120, 71, 66, 65, 64,
- 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125, 72, 66, 66, 64,
- 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125, 79, 73, 72, 71,
- 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133, 79, 73, 72,
- 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
- 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42,
- 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45,
- 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33,
- 33, 34, 34, 34, 36, 36, 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50,
- 52, 54, 56, 59, 60, 64, 64, 71, 71, 32, 32, 32, 32, 33, 33, 34, 34,
- 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54,
- 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33, 33, 34, 35, 35, 37, 37,
- 39, 39, 42, 43, 44, 45, 46, 48, 48, 51, 51, 54, 54, 57, 58, 60, 63,
- 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42,
- 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73,
- 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 49,
- 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82,
- 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58,
- 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 44,
- 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64,
- 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51,
- 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76,
- 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50,
- 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76,
- 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57,
- 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89,
- 90, 94, 95, 98, 102, 103, 108, 108, 115, 115, 65, 62, 62, 60, 59,
- 59, 59, 59, 58, 58, 63, 63, 67, 68, 71, 73, 76, 79, 81, 85, 85, 91,
- 92, 96, 98, 101, 105, 106, 111, 111, 118, 118, 73, 70, 69, 67, 66,
- 66, 65, 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98,
- 99, 103, 105, 108, 112, 114, 119, 119, 127, 127, 79, 75, 75, 73,
- 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81, 84, 86, 90, 91, 96, 96,
- 103, 103, 108, 110, 114, 118, 120, 125, 125, 133, 133,
- // Size 4x16
- 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32,
- 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46,
- 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76,
- 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118,
- // Size 16x4
- 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34,
- 34, 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43,
- 41, 43, 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59,
- 59, 58, 63, 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
- // Size 8x32
- 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31,
- 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32,
- 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34,
- 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38,
- 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48,
- 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59,
- 65, 73, 36, 34, 38, 48, 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69,
- 77, 39, 37, 40, 50, 58, 65, 71, 79, 41, 39, 41, 51, 60, 67, 74, 81,
- 44, 41, 43, 53, 63, 71, 78, 85, 44, 42, 43, 54, 64, 72, 79, 86, 48,
- 45, 46, 56, 67, 76, 83, 91, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49,
- 49, 59, 71, 81, 89, 98, 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52,
- 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56,
- 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60,
- 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64,
- 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79, 72,
- 70, 79, 90, 104, 115, 127,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41,
- 44, 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42,
- 45, 45, 49, 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32,
- 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46,
- 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35,
- 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60,
- 62, 63, 66, 68, 69, 73, 73, 79, 79, 44, 42, 42, 41, 41, 41, 42, 42,
- 42, 43, 48, 48, 52, 54, 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75,
- 77, 79, 81, 84, 85, 90, 90, 53, 51, 51, 50, 49, 49, 50, 49, 49, 50,
- 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92,
- 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56, 61,
- 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102,
- 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64,
- 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108,
- 112, 114, 119, 119, 127, 127},
- {// Chroma
- // Size 4x4
- 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
- // Size 8x8
- 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38,
- 42, 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45,
- 45, 53, 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49,
- 56, 63, 69, 73, 77, 57, 54, 52, 58, 65, 72, 77, 82,
- // Size 16x16
- 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31,
- 31, 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31,
- 32, 35, 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35,
- 39, 43, 45, 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43,
- 47, 47, 47, 47, 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47,
- 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50,
- 52, 52, 52, 52, 53, 53, 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52,
- 53, 54, 55, 55, 56, 57, 58, 60, 62, 49, 47, 45, 46, 46, 49, 52, 54,
- 55, 57, 58, 59, 60, 61, 63, 65, 49, 47, 45, 45, 45, 49, 52, 55, 57,
- 59, 60, 61, 63, 64, 66, 68, 50, 48, 46, 46, 46, 50, 53, 55, 58, 60,
- 61, 63, 65, 67, 68, 71, 52, 50, 47, 47, 47, 50, 53, 56, 59, 61, 63,
- 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52, 55, 57, 60, 63, 65, 68,
- 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58, 61, 64, 67, 70, 73,
- 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66, 68, 72, 75, 79,
- 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75, 78, 82, 85,
- 89,
- // Size 32x32
- 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49,
- 49, 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31,
- 31, 31, 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47,
- 48, 48, 50, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31,
- 31, 33, 34, 35, 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48,
- 49, 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33,
- 35, 36, 39, 40, 42, 42, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48,
- 50, 50, 51, 52, 53, 55, 55, 58, 58, 30, 31, 31, 31, 32, 33, 35, 36,
- 40, 40, 42, 42, 45, 46, 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49,
- 51, 52, 52, 54, 54, 57, 57, 32, 32, 33, 33, 33, 35, 37, 38, 41, 42,
- 43, 43, 46, 47, 46, 46, 45, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51,
- 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37, 39, 40, 43, 43, 45, 45,
- 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 50, 51, 52, 53,
- 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44, 45, 45, 47, 47,
- 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53, 55,
- 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48, 47, 46,
- 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54, 37,
- 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45,
- 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42,
- 42, 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50,
- 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42,
- 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
- 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47,
- 47, 47, 48, 50, 50, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54,
- 55, 55, 56, 56, 58, 58, 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48,
- 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
- 56, 57, 58, 58, 60, 60, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50,
- 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59,
- 60, 60, 62, 62, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52,
- 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 61, 62,
- 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54,
- 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 65, 65,
- 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
- 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67, 49, 47,
- 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59, 59,
- 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47,
- 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61,
- 63, 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46,
- 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63,
- 65, 65, 66, 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47,
- 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68,
- 69, 70, 71, 72, 72, 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47,
- 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70,
- 71, 72, 73, 75, 75, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51,
- 54, 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 70, 71, 72, 73, 74,
- 75, 77, 77, 54, 52, 52, 50, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55,
- 57, 58, 60, 62, 63, 65, 65, 68, 68, 70, 71, 72, 73, 74, 75, 76, 78,
- 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58, 59,
- 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75, 77, 77, 80, 80, 57,
- 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63,
- 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82, 58, 55, 55,
- 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64, 65, 67,
- 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55, 54,
- 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72,
- 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54,
- 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
- 76, 77, 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54,
- 55, 57, 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80,
- 82, 83, 85, 85, 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
- 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
- 85, 85, 89, 89,
- // Size 4x8
- 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45,
- 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79,
- // Size 8x4
- 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47,
- 45, 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79,
- // Size 8x16
- 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30,
- 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40,
- 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48,
- 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53,
- 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59,
- 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68,
- 72, 76, 57, 52, 50, 56, 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77,
- 82, 63, 57, 55, 60, 67, 75, 80, 86,
- // Size 16x8
- 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31,
- 31, 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38,
- 40, 43, 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46,
- 47, 47, 50, 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46,
- 45, 49, 53, 55, 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47,
- 50, 53, 56, 59, 62, 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53,
- 55, 58, 61, 64, 66, 70, 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58,
- 61, 63, 66, 69, 73, 76, 79, 82, 86,
- // Size 16x32
- 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
- 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
- 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
- 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
- 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
- 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
- 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
- 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
- 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
- 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
- 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
- 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
- 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
- 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
- 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
- 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
- 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
- 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
- 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
- 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
- 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
- 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
- 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
- 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
- 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
- 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
- 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
- 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
- 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
- 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
- 86, 89,
- // Size 32x16
- 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48,
- 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31,
- 31, 32, 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46,
- 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32,
- 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46,
- 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39,
- 40, 42, 45, 45, 46, 46, 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47,
- 48, 49, 50, 51, 51, 53, 53, 56, 56, 37, 38, 38, 39, 40, 41, 43, 44,
- 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49,
- 49, 50, 51, 52, 53, 55, 55, 42, 42, 42, 42, 42, 44, 45, 45, 47, 47,
- 48, 48, 50, 50, 50, 50, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52, 53,
- 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50,
- 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58,
- 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
- 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62,
- 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56,
- 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 49,
- 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59,
- 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50,
- 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64,
- 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48,
- 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66,
- 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50,
- 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71,
- 72, 74, 75, 76, 77, 78, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50,
- 50, 53, 53, 56, 57, 58, 60, 61, 64, 64, 67, 67, 70, 71, 72, 73, 75,
- 76, 77, 79, 79, 82, 82, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56,
- 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80,
- 82, 83, 86, 86, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60,
- 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85,
- 89, 89,
- // Size 4x16
- 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40,
- 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49,
- 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63,
- 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82,
- // Size 16x4
- 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42,
- 42, 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47,
- 45, 46, 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52,
- 51, 50, 53, 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
- // Size 8x32
- 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31,
- 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32,
- 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43,
- 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47,
- 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49,
- 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53,
- 55, 58, 49, 46, 48, 53, 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58,
- 61, 48, 46, 47, 53, 56, 57, 59, 62, 48, 45, 46, 53, 57, 59, 61, 63,
- 49, 45, 46, 53, 58, 61, 63, 66, 49, 45, 46, 53, 58, 62, 64, 66, 50,
- 46, 46, 54, 59, 64, 66, 69, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48,
- 47, 54, 61, 66, 69, 72, 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48,
- 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56,
- 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64,
- 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73,
- 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80,
- 86,
- // Size 32x8
- 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48,
- 49, 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31,
- 31, 32, 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45,
- 46, 46, 48, 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39,
- 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46,
- 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46,
- 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
- 55, 55, 56, 56, 57, 58, 59, 60, 60, 49, 47, 47, 46, 45, 45, 46, 45,
- 45, 46, 49, 49, 53, 53, 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62,
- 63, 64, 64, 65, 65, 67, 67, 52, 50, 50, 48, 48, 48, 47, 47, 47, 47,
- 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69, 70,
- 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53,
- 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77,
- 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59,
- 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86,
- 86},
- },
- // Quantizer level 7.
- {
- {// Luma
- // Size 4x4
- 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
- // Size 8x8
- 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32,
- 32, 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37,
- 40, 47, 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49,
- 56, 65, 75, 82, 92, 63, 59, 58, 65, 73, 84, 92, 105,
- // Size 16x16
- 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31,
- 32, 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32,
- 32, 32, 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32,
- 33, 33, 33, 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33,
- 34, 34, 36, 37, 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34,
- 35, 37, 38, 40, 41, 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37,
- 39, 42, 44, 46, 47, 51, 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42,
- 48, 50, 52, 54, 57, 60, 63, 65, 68, 38, 38, 37, 38, 39, 40, 44, 50,
- 52, 54, 57, 60, 64, 67, 69, 72, 41, 40, 38, 40, 40, 41, 46, 52, 54,
- 57, 60, 63, 67, 70, 73, 75, 44, 42, 41, 42, 42, 42, 47, 54, 57, 60,
- 63, 67, 71, 74, 77, 79, 48, 46, 44, 45, 45, 46, 51, 57, 60, 63, 67,
- 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76,
- 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63, 67, 70, 74, 79, 86,
- 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73, 77, 82, 89, 93,
- 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85, 92, 97,
- 101, 105,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38,
- 39, 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40,
- 42, 42, 46, 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42,
- 45, 46, 48, 51, 51, 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45,
- 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49,
- 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 34, 34, 34, 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53,
- 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
- 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 49, 49, 53, 54, 56,
- 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36,
- 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54, 56, 59, 59,
- 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37,
- 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40,
- 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32,
- 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42,
- 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33,
- 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49,
- 50, 51, 53, 53, 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35,
- 35, 36, 37, 37, 39, 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52,
- 54, 54, 57, 58, 60, 63, 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36,
- 37, 37, 40, 41, 42, 45, 45, 46, 47, 48, 50, 50, 52, 53, 54, 56, 56,
- 59, 60, 62, 65, 65, 69, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38,
- 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64,
- 65, 68, 68, 72, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42,
- 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58, 60, 60, 63, 64, 65, 68,
- 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50,
- 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
- 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47, 50, 50, 53,
- 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77, 41, 40,
- 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55, 57,
- 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42,
- 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63,
- 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41,
- 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67,
- 68, 71, 71, 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45,
- 45, 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75,
- 75, 78, 79, 81, 84, 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46,
- 46, 50, 51, 53, 57, 57, 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79,
- 80, 82, 85, 85, 89, 50, 49, 48, 47, 46, 46, 47, 47, 47, 47, 47, 51,
- 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75, 78, 78, 82, 83, 85,
- 88, 88, 92, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56,
- 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92,
- 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60,
- 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92, 92, 96, 58,
- 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63, 67, 68,
- 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 59, 57,
- 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69, 71,
- 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58,
- 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77,
- 77, 81, 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61,
- 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79,
- 84, 85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59,
- 59, 59, 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84,
- 85, 88, 92, 92, 97, 98, 101, 105, 105, 109, 70, 67, 67, 65, 64, 64,
- 63, 63, 63, 62, 62, 66, 67, 69, 72, 72, 76, 77, 79, 83, 83, 88, 89,
- 92, 96, 96, 101, 102, 105, 109, 109, 114,
- // Size 4x8
- 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38,
- 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97,
- // Size 8x4
- 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42,
- 41, 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97,
- // Size 8x16
- 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31,
- 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32,
- 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37,
- 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47,
- 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57,
- 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71,
- 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89,
- 100, 65, 60, 58, 66, 72, 79, 92, 105,
- // Size 16x8
- 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31,
- 32, 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32,
- 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34,
- 35, 36, 37, 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38,
- 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42,
- 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49,
- 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63,
- 68, 72, 76, 79, 85, 92, 97, 100, 105,
- // Size 16x32
- 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31,
- 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32,
- 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32,
- 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32,
- 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33,
- 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34,
- 35, 36, 38, 42, 42, 48, 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36,
- 36, 38, 42, 42, 48, 50, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 37,
- 39, 42, 42, 48, 49, 53, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40,
- 42, 42, 48, 49, 52, 58, 58, 32, 32, 33, 34, 35, 35, 37, 38, 40, 42,
- 42, 48, 49, 52, 58, 58, 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46,
- 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53,
- 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56,
- 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63,
- 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68,
- 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
- 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73, 41,
- 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41,
- 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41,
- 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44,
- 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46,
- 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47,
- 55, 58, 61, 68, 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57,
- 60, 64, 71, 71, 79, 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60,
- 64, 71, 71, 79, 82, 86, 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67,
- 74, 74, 83, 86, 90, 97, 97, 58, 55, 54, 54, 54, 54, 61, 63, 68, 75,
- 75, 84, 87, 91, 98, 98, 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77,
- 86, 89, 93, 100, 100, 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79,
- 89, 92, 97, 105, 105, 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79,
- 89, 92, 97, 105, 105, 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83,
- 93, 96, 101, 109, 109,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38,
- 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39,
- 41, 41, 44, 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41,
- 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32,
- 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45,
- 47, 50, 50, 53, 54, 56, 59, 59, 63, 32, 32, 32, 32, 33, 33, 34, 34,
- 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50,
- 50, 53, 54, 56, 58, 58, 62, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35,
- 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53,
- 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
- 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63,
- 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44,
- 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63, 65, 68, 68,
- 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50,
- 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44,
- 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58,
- 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42,
- 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63,
- 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47,
- 47, 48, 48, 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73,
- 74, 76, 79, 79, 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49,
- 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
- 82, 82, 86, 87, 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53,
- 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86,
- 90, 91, 93, 97, 97, 101, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58,
- 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97,
- 98, 100, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58,
- 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
- 100, 105, 105, 109,
- // Size 4x16
- 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32,
- 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40,
- 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71,
- 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97,
- // Size 16x4
- 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32,
- 32, 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42,
- 41, 42, 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53,
- 53, 53, 52, 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
- // Size 8x32
- 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31,
- 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32,
- 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34,
- 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36,
- 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40,
- 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48,
- 54, 63, 34, 34, 37, 43, 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60,
- 68, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72,
- 39, 37, 40, 48, 53, 58, 65, 73, 41, 39, 41, 49, 54, 60, 67, 76, 44,
- 41, 43, 51, 57, 63, 71, 79, 44, 41, 43, 51, 57, 63, 71, 79, 47, 44,
- 45, 53, 59, 66, 75, 84, 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47,
- 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57,
- 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68,
- 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72,
- 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76,
- 83, 96, 109,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38,
- 39, 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39,
- 41, 41, 44, 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32,
- 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43,
- 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34,
- 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54,
- 55, 57, 57, 60, 61, 63, 66, 66, 70, 39, 38, 38, 37, 37, 37, 38, 38,
- 39, 40, 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64,
- 64, 67, 68, 69, 72, 72, 76, 44, 42, 42, 41, 41, 41, 42, 42, 42, 42,
- 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71, 74,
- 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53,
- 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89,
- 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65,
- 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98, 100, 105,
- 105, 109},
- {// Chroma
- // Size 4x4
- 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
- // Size 8x8
- 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38,
- 40, 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45,
- 46, 51, 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47,
- 52, 57, 63, 66, 70, 55, 52, 50, 54, 60, 66, 70, 76,
- // Size 16x16
- 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31,
- 31, 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31,
- 32, 34, 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34,
- 37, 40, 42, 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40,
- 42, 45, 46, 47, 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45,
- 47, 47, 48, 47, 46, 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47,
- 48, 50, 50, 49, 49, 50, 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50,
- 53, 53, 53, 53, 54, 54, 55, 56, 56, 48, 47, 45, 46, 46, 47, 50, 53,
- 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 46, 46, 46, 49, 53, 54,
- 55, 57, 58, 59, 60, 60, 61, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57,
- 58, 60, 61, 62, 63, 63, 50, 48, 46, 46, 46, 46, 50, 54, 56, 58, 60,
- 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47, 50, 54, 57, 59, 61, 63,
- 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55, 58, 60, 62, 65, 68,
- 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60, 63, 66, 69, 71,
- 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67, 70, 73, 74,
- 76,
- // Size 32x32
- 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48,
- 48, 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
- 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47,
- 47, 47, 48, 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31,
- 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47,
- 48, 48, 49, 50, 50, 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31,
- 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47,
- 48, 49, 49, 50, 51, 52, 53, 53, 55, 30, 31, 31, 31, 32, 32, 34, 35,
- 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47,
- 47, 49, 49, 50, 52, 52, 54, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40,
- 40, 42, 42, 44, 46, 46, 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49,
- 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44,
- 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50,
- 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46,
- 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51, 51,
- 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47,
- 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 36,
- 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46,
- 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38,
- 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
- 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42,
- 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49,
- 49, 49, 49, 49, 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44,
- 45, 46, 47, 47, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
- 50, 50, 51, 52, 52, 53, 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46,
- 47, 47, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52,
- 53, 53, 54, 54, 54, 56, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48,
- 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55,
- 56, 56, 56, 58, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50,
- 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56,
- 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53,
- 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 60,
- 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
- 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61, 49, 47,
- 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55, 55,
- 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46,
- 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58,
- 59, 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45,
- 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60,
- 60, 61, 61, 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46,
- 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63,
- 63, 64, 64, 65, 66, 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46,
- 46, 49, 50, 51, 54, 54, 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65,
- 65, 66, 67, 67, 68, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49,
- 50, 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 65, 65, 66, 66, 67,
- 68, 68, 70, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52,
- 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70,
- 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54, 54,
- 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68, 69, 70, 70, 72, 54,
- 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58,
- 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 54, 52, 52,
- 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58, 60, 62,
- 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52, 50,
- 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65,
- 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51,
- 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
- 70, 70, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50,
- 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70,
- 73, 73, 74, 76, 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51,
- 54, 55, 56, 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75,
- 76, 78, 78, 80,
- // Size 4x8
- 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46,
- 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73,
- // Size 8x4
- 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47,
- 45, 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73,
- // Size 8x16
- 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30,
- 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37,
- 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47,
- 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51,
- 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55,
- 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61,
- 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69,
- 74, 57, 52, 50, 55, 59, 64, 70, 76,
- // Size 16x8
- 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31,
- 31, 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38,
- 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44,
- 46, 46, 47, 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46,
- 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45,
- 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47,
- 50, 54, 57, 59, 61, 64, 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53,
- 57, 59, 61, 64, 67, 71, 73, 74, 76,
- // Size 16x32
- 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
- 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
- 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
- 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
- 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
- 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
- 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
- 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
- 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
- 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
- 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
- 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
- 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
- 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
- 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
- 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
- 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
- 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
- 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
- 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
- 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
- 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
- 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
- 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
- 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
- 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
- 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
- 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
- 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
- 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
- 78, 78,
- // Size 32x16
- 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48,
- 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
- 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46,
- 46, 46, 47, 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32,
- 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45,
- 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35,
- 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46,
- 47, 47, 47, 49, 49, 50, 51, 51, 53, 37, 38, 38, 39, 40, 40, 42, 43,
- 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47,
- 47, 48, 49, 49, 50, 50, 52, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47,
- 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48,
- 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
- 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54,
- 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51,
- 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56,
- 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
- 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49,
- 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56,
- 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47,
- 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58,
- 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47,
- 47, 47, 47, 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62,
- 63, 64, 65, 65, 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47,
- 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65,
- 66, 66, 68, 68, 69, 70, 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48,
- 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68,
- 70, 70, 71, 73, 73, 74, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50,
- 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73,
- 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53,
- 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76,
- 76, 78,
- // Size 4x16
- 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37,
- 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47,
- 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61,
- 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73,
- // Size 16x4
- 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37,
- 38, 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47,
- 45, 45, 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49,
- 49, 48, 48, 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
- // Size 8x32
- 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31,
- 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32,
- 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42,
- 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46,
- 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47,
- 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49,
- 50, 53, 44, 44, 47, 50, 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54,
- 57, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59,
- 48, 46, 47, 51, 54, 56, 57, 60, 48, 45, 46, 51, 54, 57, 59, 61, 49,
- 45, 46, 51, 55, 58, 61, 64, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46,
- 46, 52, 56, 59, 63, 66, 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47,
- 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53,
- 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58,
- 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64,
- 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72,
- 78,
- // Size 32x8
- 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48,
- 48, 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31,
- 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45,
- 45, 45, 46, 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39,
- 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46,
- 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44,
- 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52,
- 52, 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 45, 45, 46, 46,
- 46, 47, 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57,
- 57, 58, 58, 58, 59, 59, 61, 49, 47, 47, 46, 45, 45, 45, 46, 45, 45,
- 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59, 59, 60, 61, 61, 62,
- 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50,
- 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69,
- 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54,
- 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76,
- 78},
- },
- // Quantizer level 8.
- {
- {// Luma
- // Size 4x4
- 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
- // Size 8x8
- 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32,
- 32, 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34,
- 37, 42, 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45,
- 51, 57, 65, 71, 77, 53, 50, 51, 55, 61, 70, 77, 85,
- // Size 16x16
- 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31,
- 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32,
- 32, 32, 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32,
- 32, 32, 33, 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32,
- 33, 34, 34, 35, 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34,
- 35, 36, 37, 38, 39, 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36,
- 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38,
- 39, 42, 44, 45, 47, 50, 51, 54, 58, 36, 35, 34, 35, 36, 38, 40, 42,
- 48, 50, 50, 54, 56, 57, 60, 64, 38, 37, 36, 36, 38, 39, 41, 44, 50,
- 51, 52, 56, 58, 60, 63, 67, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52,
- 54, 58, 60, 62, 65, 69, 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58,
- 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66,
- 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51, 57, 60, 62, 68, 71,
- 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77,
- 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87,
- 92,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36,
- 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37,
- 39, 39, 41, 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38,
- 41, 42, 43, 45, 46, 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42,
- 42, 45, 46, 47, 51, 51, 52, 56, 56, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44,
- 45, 46, 49, 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 44, 45,
- 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41, 44, 45, 45, 49, 49,
- 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
- 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49, 50, 51, 54,
- 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 36,
- 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37,
- 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32,
- 32, 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40,
- 40, 42, 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32,
- 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42,
- 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33,
- 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45,
- 47, 47, 48, 51, 51, 52, 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35,
- 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51,
- 51, 54, 54, 55, 58, 58, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
- 37, 38, 39, 39, 41, 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54,
- 54, 55, 58, 58, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39,
- 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51, 53, 53, 54, 57, 57, 58,
- 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42,
- 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64,
- 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
- 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64, 38, 37,
- 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50, 51,
- 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38,
- 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54,
- 56, 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37,
- 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58,
- 58, 60, 61, 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41,
- 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64,
- 65, 66, 69, 69, 70, 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42,
- 42, 42, 44, 47, 47, 50, 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68,
- 71, 71, 72, 75, 75, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43,
- 45, 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 66, 67, 68, 71, 72,
- 73, 76, 76, 47, 46, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 47, 50,
- 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76, 79,
- 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51, 53,
- 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75, 76, 77, 80, 80, 49,
- 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57,
- 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81, 53, 51, 51,
- 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59, 62, 64,
- 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51, 50,
- 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69,
- 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50,
- 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
- 76, 77, 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54,
- 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80,
- 81, 86, 87, 88, 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
- 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
- 87, 88, 92, 92,
- // Size 4x8
- 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35,
- 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83,
- // Size 8x4
- 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37,
- 36, 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83,
- // Size 8x16
- 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31,
- 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32,
- 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35,
- 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40,
- 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50,
- 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66,
- 69, 75, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75,
- 82, 58, 54, 54, 55, 63, 75, 79, 87,
- // Size 16x8
- 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31,
- 32, 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32,
- 32, 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33,
- 33, 34, 36, 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35,
- 36, 38, 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42,
- 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45,
- 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51,
- 54, 60, 63, 65, 71, 75, 77, 82, 87,
- // Size 16x32
- 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
- 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
- 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
- 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
- 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
- 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
- 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
- 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
- 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
- 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
- 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
- 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
- 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
- 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
- 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
- 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
- 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
- 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
- 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
- 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
- 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
- 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
- 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
- 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
- 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
- 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
- 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
- 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
- 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
- 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
- 87, 92,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36,
- 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37,
- 38, 38, 40, 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37,
- 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41,
- 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33,
- 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45,
- 46, 47, 49, 50, 51, 54, 54, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
- 35, 35, 36, 37, 37, 37, 38, 38, 40, 40, 40, 42, 43, 43, 45, 46, 47,
- 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
- 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51,
- 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 39, 42,
- 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58, 59, 60, 62,
- 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45,
- 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 38,
- 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49,
- 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42,
- 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58,
- 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41,
- 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61,
- 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44,
- 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
- 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50,
- 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76,
- 77, 81, 82, 83, 87, 87, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49,
- 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
- 82, 83, 87, 87, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55,
- 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88,
- 92, 92,
- // Size 4x16
- 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32,
- 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38,
- 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58,
- 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87,
- // Size 16x4
- 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32,
- 32, 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37,
- 36, 36, 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49,
- 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
- // Size 8x32
- 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31,
- 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32,
- 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33,
- 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34,
- 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38,
- 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44,
- 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50,
- 54, 35, 34, 37, 39, 45, 50, 53, 57, 36, 34, 37, 40, 48, 54, 56, 60,
- 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63, 39,
- 37, 40, 42, 50, 58, 60, 65, 39, 37, 40, 42, 50, 58, 60, 65, 42, 40,
- 42, 44, 52, 61, 64, 69, 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43,
- 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48,
- 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59,
- 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72,
- 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79,
- 87,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36,
- 36, 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36,
- 37, 37, 40, 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40,
- 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33,
- 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45,
- 45, 47, 48, 48, 51, 51, 53, 55, 55, 36, 35, 35, 35, 35, 34, 35, 36,
- 36, 37, 38, 38, 40, 42, 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56,
- 56, 57, 59, 60, 61, 63, 63, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42,
- 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67, 67,
- 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45,
- 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75,
- 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54,
- 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87,
- 87},
- {// Chroma
- // Size 4x4
- 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
- // Size 8x8
- 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35,
- 36, 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46,
- 47, 50, 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46,
- 50, 54, 59, 61, 64, 51, 48, 48, 51, 54, 60, 64, 68,
- // Size 16x16
- 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31,
- 31, 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31,
- 31, 32, 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32,
- 33, 36, 40, 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36,
- 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43,
- 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47,
- 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48,
- 48, 50, 50, 49, 49, 50, 50, 50, 52, 49, 47, 46, 46, 47, 48, 49, 50,
- 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 46, 46, 46, 47, 48, 50, 53,
- 53, 54, 55, 55, 55, 56, 57, 48, 47, 46, 45, 46, 46, 48, 49, 53, 54,
- 54, 55, 56, 56, 57, 58, 49, 47, 45, 45, 45, 45, 47, 49, 53, 55, 55,
- 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
- 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61,
- 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
- 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
- 71,
- // Size 32x32
- 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49,
- 49, 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
- 31, 31, 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47,
- 47, 47, 47, 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31,
- 31, 31, 32, 34, 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47,
- 47, 47, 47, 48, 48, 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31,
- 32, 34, 34, 36, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46,
- 47, 48, 48, 48, 49, 49, 50, 52, 52, 30, 31, 31, 31, 31, 31, 32, 35,
- 35, 36, 39, 39, 41, 42, 42, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46,
- 47, 47, 48, 48, 48, 50, 50, 30, 31, 31, 31, 31, 32, 32, 35, 35, 36,
- 40, 40, 41, 42, 42, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46,
- 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32, 33, 35, 36, 37, 40, 40,
- 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45, 46, 46, 46, 47, 47,
- 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40, 43, 43, 43, 44,
- 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49,
- 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45, 45, 46,
- 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 34,
- 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47,
- 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38,
- 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46,
- 46, 45, 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39,
- 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
- 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41,
- 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47,
- 47, 47, 47, 48, 48, 48, 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45,
- 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50,
- 50, 50, 50, 51, 52, 52, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
- 47, 48, 48, 48, 49, 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50,
- 50, 51, 52, 52, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48,
- 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52,
- 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50,
- 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
- 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
- 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 47,
- 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53,
- 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47,
- 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54,
- 55, 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45,
- 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55,
- 56, 56, 56, 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45,
- 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58,
- 59, 59, 60, 60, 60, 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45,
- 45, 45, 47, 49, 49, 51, 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60,
- 61, 61, 61, 62, 62, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45,
- 47, 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 59, 60, 60, 61, 61,
- 62, 63, 63, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50,
- 50, 52, 54, 54, 55, 56, 56, 58, 59, 59, 61, 61, 61, 63, 63, 63, 64,
- 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 52,
- 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 65, 51,
- 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54,
- 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66, 52, 50, 49,
- 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54, 56, 57,
- 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49, 48,
- 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60,
- 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48,
- 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
- 63, 64, 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49,
- 49, 48, 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65,
- 66, 68, 68, 69, 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
- 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
- 68, 69, 71, 71,
- // Size 4x8
- 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47,
- 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67,
- // Size 8x4
- 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47,
- 46, 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67,
- // Size 8x16
- 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30,
- 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36,
- 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46,
- 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49,
- 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53,
- 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59,
- 61, 63, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63,
- 66, 54, 50, 49, 50, 55, 62, 65, 68,
- // Size 16x8
- 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31,
- 31, 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37,
- 38, 38, 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40,
- 41, 44, 47, 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46,
- 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46,
- 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46,
- 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48,
- 50, 54, 56, 57, 61, 63, 64, 66, 68,
- // Size 16x32
- 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
- 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
- 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
- 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
- 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
- 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
- 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
- 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
- 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
- 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
- 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
- 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
- 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
- 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
- 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
- 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
- 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
- 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
- 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
- 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
- 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
- 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
- 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
- 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
- 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
- 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
- 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
- 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
- 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
- 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
- 68, 71,
- // Size 32x16
- 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49,
- 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
- 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47,
- 46, 46, 46, 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31,
- 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46,
- 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33,
- 33, 36, 36, 37, 41, 41, 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45,
- 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41,
- 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46,
- 46, 47, 47, 47, 48, 49, 49, 37, 38, 38, 38, 39, 40, 40, 43, 43, 44,
- 47, 47, 47, 47, 47, 47, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47,
- 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47,
- 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48,
- 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 50,
- 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 54, 55,
- 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51,
- 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 48,
- 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
- 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47,
- 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
- 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45,
- 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57,
- 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46,
- 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60,
- 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47,
- 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64,
- 64, 66, 66, 67, 68, 68, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47,
- 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66,
- 66, 67, 68, 68, 54, 53, 52, 52, 50, 50, 50, 49, 49, 49, 48, 48, 50,
- 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63, 65, 65, 66, 68, 68, 69,
- 71, 71,
- // Size 4x16
- 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35,
- 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48,
- 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55,
- 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68,
- // Size 16x4
- 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37,
- 38, 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47,
- 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48,
- 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
- // Size 8x32
- 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31,
- 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32,
- 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38,
- 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44,
- 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47,
- 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47,
- 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50,
- 50, 45, 44, 47, 48, 51, 51, 52, 52, 49, 46, 48, 49, 53, 53, 54, 54,
- 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56, 48,
- 46, 46, 48, 53, 56, 56, 57, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45,
- 46, 47, 53, 57, 58, 60, 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46,
- 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48,
- 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54,
- 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61,
- 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65,
- 68,
- // Size 32x8
- 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49,
- 49, 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31,
- 31, 31, 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46,
- 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37,
- 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46,
- 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41,
- 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47,
- 47, 48, 48, 48, 48, 48, 49, 50, 50, 48, 47, 47, 47, 46, 46, 46, 47,
- 47, 47, 47, 47, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54,
- 54, 54, 54, 54, 54, 55, 55, 49, 48, 47, 47, 45, 45, 45, 45, 46, 45,
- 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59, 60,
- 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46,
- 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63,
- 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50,
- 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68,
- 68},
- },
- // Quantizer level 9.
- {
- {// Luma
- // Size 4x4
- 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
- // Size 8x8
- 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32,
- 32, 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34,
- 35, 37, 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42,
- 42, 49, 56, 63, 67, 47, 44, 45, 46, 52, 59, 67, 71,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31,
- 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32,
- 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33,
- 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34,
- 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36,
- 36, 38, 39, 40, 42, 43, 44, 47, 47, 34, 34, 34, 33, 35, 35, 37, 38,
- 39, 42, 42, 45, 46, 47, 50, 51, 35, 35, 34, 34, 36, 36, 38, 39, 42,
- 46, 47, 49, 50, 52, 55, 55, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
- 48, 50, 52, 54, 56, 57, 39, 38, 38, 37, 39, 39, 40, 42, 45, 49, 50,
- 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55,
- 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60,
- 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
- 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70,
- 71,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34,
- 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
- 35, 35, 37, 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35,
- 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38,
- 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39,
- 42, 42, 42, 45, 45, 45, 48, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41,
- 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 38, 41, 41, 41, 44,
- 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42, 44, 45, 45,
- 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35,
- 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 31,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
- 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36,
- 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39,
- 40, 40, 41, 42, 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40,
- 41, 42, 42, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42,
- 42, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
- 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 44, 44, 45,
- 47, 47, 47, 50, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37,
- 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51,
- 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38,
- 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48, 50, 51, 51, 53,
- 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
- 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54, 35, 35,
- 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43, 46,
- 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35,
- 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48,
- 50, 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34,
- 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50,
- 50, 52, 54, 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36,
- 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54,
- 56, 56, 57, 58, 59, 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39,
- 39, 40, 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58,
- 58, 60, 61, 61, 63, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40,
- 40, 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60,
- 61, 61, 63, 41, 40, 40, 40, 39, 38, 38, 39, 40, 40, 40, 41, 41, 41,
- 43, 46, 46, 47, 50, 52, 52, 54, 55, 55, 57, 60, 60, 60, 62, 63, 63,
- 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47,
- 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 44,
- 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49,
- 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 45, 44, 43,
- 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49, 53, 54,
- 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45, 45,
- 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58,
- 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44,
- 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
- 63, 67, 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45,
- 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67,
- 67, 68, 70, 71, 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48,
- 48, 48, 48, 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70,
- 73, 74, 74, 77,
- // Size 4x8
- 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34,
- 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67,
- // Size 8x4
- 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35,
- 34, 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67,
- // Size 8x16
- 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31,
- 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32,
- 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34,
- 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37,
- 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46,
- 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51,
- 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66,
- 69, 48, 45, 45, 46, 54, 56, 67, 70,
- // Size 16x8
- 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32,
- 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32,
- 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34,
- 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36,
- 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42,
- 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45,
- 47, 50, 55, 56, 60, 62, 66, 69, 70,
- // Size 16x32
- 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
- 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
- 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
- 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
- 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
- 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
- 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
- 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
- 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
- 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
- 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
- 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
- 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
- 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
- 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
- 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
- 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
- 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
- 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
- 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
- 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
- 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
- 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
- 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
- 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
- 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
- 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
- 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
- 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
- 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
- 73, 79,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34,
- 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
- 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34,
- 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37,
- 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40,
- 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
- 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43,
- 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
- 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45,
- 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
- 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45, 47, 48, 48,
- 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41,
- 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36,
- 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43,
- 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35,
- 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48,
- 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39,
- 38, 38, 38, 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53,
- 54, 54, 56, 59, 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41,
- 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58,
- 60, 63, 63, 64, 66, 67, 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42,
- 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63,
- 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45,
- 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
- 69, 70, 70, 73, 53, 52, 51, 51, 50, 49, 49, 49, 49, 50, 50, 49, 49,
- 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65, 67, 71, 71, 72, 75, 76,
- 76, 79,
- // Size 4x16
- 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32,
- 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37,
- 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51,
- 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67,
- // Size 16x4
- 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32,
- 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35,
- 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41,
- 41, 42, 42, 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
- // Size 8x32
- 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31,
- 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32,
- 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32,
- 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34,
- 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36,
- 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38,
- 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44,
- 47, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 41, 42, 48, 50,
- 34, 34, 35, 37, 42, 43, 49, 51, 35, 34, 36, 38, 45, 47, 52, 55, 36,
- 34, 36, 38, 46, 48, 54, 56, 36, 34, 36, 38, 46, 48, 54, 56, 38, 36,
- 37, 40, 47, 49, 56, 58, 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39,
- 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43,
- 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51,
- 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56,
- 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69,
- 73,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34,
- 34, 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36,
- 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33,
- 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40,
- 40, 41, 43, 43, 43, 45, 46, 46, 48, 35, 35, 35, 35, 34, 34, 34, 34,
- 35, 36, 36, 37, 37, 37, 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49,
- 51, 51, 51, 53, 54, 54, 56, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36,
- 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49, 50, 50, 51, 53, 53,
- 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42,
- 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66,
- 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45,
- 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70,
- 73},
- {// Chroma
- // Size 4x4
- 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
- // Size 8x8
- 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34,
- 35, 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43,
- 45, 47, 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45,
- 45, 50, 55, 58, 60, 49, 46, 46, 46, 50, 55, 60, 61,
- // Size 16x16
- 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31,
- 31, 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31,
- 31, 31, 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31,
- 32, 34, 35, 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34,
- 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38,
- 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43,
- 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47,
- 47, 48, 48, 49, 48, 47, 47, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48,
- 48, 50, 50, 49, 49, 49, 50, 50, 47, 46, 46, 45, 46, 47, 47, 48, 50,
- 52, 52, 52, 52, 52, 53, 53, 49, 47, 47, 46, 47, 47, 48, 49, 50, 52,
- 53, 53, 53, 53, 54, 54, 48, 47, 46, 45, 46, 46, 46, 48, 49, 52, 53,
- 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
- 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57,
- 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
- 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
- 61,
- // Size 32x32
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41,
- 43, 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31,
- 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47,
- 48, 48, 48, 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31,
- 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47,
- 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31,
- 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47,
- 47, 47, 47, 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32,
- 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 46, 46, 46, 46,
- 46, 46, 46, 47, 47, 47, 48, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35,
- 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45,
- 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38,
- 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46,
- 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40,
- 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46,
- 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44,
- 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33,
- 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45,
- 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34,
- 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47,
- 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37,
- 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47,
- 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40,
- 40, 42, 43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46,
- 46, 45, 45, 45, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42,
- 43, 43, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45,
- 45, 45, 46, 46, 46, 46, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44,
- 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47,
- 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47,
- 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50,
- 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48,
- 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50,
- 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
- 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 47, 47,
- 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52,
- 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47,
- 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53,
- 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46,
- 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53,
- 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46,
- 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54,
- 55, 55, 55, 55, 55, 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46,
- 46, 46, 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55,
- 56, 56, 56, 56, 57, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
- 46, 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56,
- 56, 56, 57, 49, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46,
- 47, 49, 49, 50, 52, 53, 53, 54, 55, 55, 55, 57, 57, 57, 57, 58, 58,
- 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49,
- 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49,
- 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50,
- 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60, 49, 48, 47,
- 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53,
- 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48, 47,
- 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55,
- 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46,
- 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
- 58, 60, 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46,
- 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60,
- 60, 60, 61, 61, 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47,
- 47, 46, 46, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61,
- 62, 63, 63, 64,
- // Size 4x8
- 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43,
- 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59,
- // Size 8x4
- 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47,
- 46, 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59,
- // Size 8x16
- 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31,
- 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35,
- 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43,
- 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47,
- 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52,
- 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53,
- 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59,
- 61, 50, 46, 46, 46, 52, 54, 59, 61,
- // Size 16x8
- 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31,
- 31, 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34,
- 34, 35, 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39,
- 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44,
- 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47,
- 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46,
- 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46,
- 47, 50, 53, 54, 56, 57, 59, 61, 61,
- // Size 16x32
- 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
- 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
- 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
- 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
- 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
- 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
- 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
- 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
- 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
- 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
- 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
- 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
- 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
- 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
- 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
- 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
- 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
- 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
- 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
- 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
- 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
- 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
- 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
- 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
- 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
- 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
- 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
- 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
- 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
- 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
- 62, 65,
- // Size 32x16
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42,
- 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31,
- 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46,
- 47, 47, 47, 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31,
- 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46,
- 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32,
- 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46,
- 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35,
- 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46,
- 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
- 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46,
- 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
- 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46,
- 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
- 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 48, 48, 48,
- 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49,
- 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48,
- 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50,
- 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47,
- 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53,
- 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46,
- 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54,
- 54, 54, 55, 56, 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45,
- 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56,
- 57, 58, 58, 58, 59, 59, 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45,
- 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58,
- 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
- 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60,
- 61, 61, 61, 62, 52, 51, 50, 50, 49, 48, 48, 48, 47, 47, 47, 47, 47,
- 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 59, 61, 61, 62, 63, 64,
- 64, 65,
- // Size 4x16
- 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34,
- 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47,
- 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53,
- 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59,
- // Size 16x4
- 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37,
- 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47,
- 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46,
- 45, 45, 46, 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
- // Size 8x32
- 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31,
- 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32,
- 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35,
- 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42,
- 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46,
- 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47,
- 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47,
- 47, 42, 43, 44, 47, 49, 50, 49, 50, 42, 43, 44, 47, 49, 50, 49, 50,
- 43, 43, 45, 47, 50, 50, 50, 50, 47, 46, 46, 48, 51, 52, 53, 53, 49,
- 46, 47, 48, 52, 53, 53, 54, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46,
- 46, 47, 52, 53, 55, 55, 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46,
- 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46,
- 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52,
- 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54,
- 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60,
- 62,
- // Size 32x8
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42,
- 43, 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31,
- 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46,
- 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34,
- 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47,
- 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40,
- 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47,
- 47, 46, 46, 46, 46, 46, 46, 46, 47, 45, 45, 45, 45, 45, 44, 44, 45,
- 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51,
- 51, 51, 52, 52, 52, 52, 52, 48, 47, 47, 47, 46, 46, 46, 46, 47, 47,
- 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 53, 53, 53, 53, 53,
- 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45,
- 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59,
- 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46,
- 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61,
- 62},
- },
- // Quantizer level 10.
- {
- {// Luma
- // Size 4x4
- 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
- // Size 8x8
- 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32,
- 32, 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33,
- 34, 35, 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37,
- 38, 41, 46, 51, 54, 41, 39, 40, 41, 44, 49, 54, 58,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32,
- 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33,
- 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34,
- 35, 35, 37, 37, 38, 38, 40, 40, 42, 32, 32, 32, 32, 32, 34, 34, 35,
- 35, 37, 37, 38, 38, 40, 40, 42, 34, 34, 34, 33, 33, 35, 35, 37, 37,
- 39, 39, 42, 42, 45, 45, 47, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39,
- 39, 42, 42, 45, 45, 47, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42,
- 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
- 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50,
- 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54,
- 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58,
- 63,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
- 34, 34, 34, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
- 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
- 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
- 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 38,
- 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37,
- 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41,
- 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
- 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
- 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36,
- 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36,
- 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 38,
- 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
- 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
- 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
- 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41,
- 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
- 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42,
- 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
- 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45, 34, 34,
- 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39,
- 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34,
- 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39,
- 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34,
- 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42,
- 42, 42, 44, 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34,
- 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45,
- 46, 47, 47, 47, 49, 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35,
- 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50,
- 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
- 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50,
- 52, 54, 54, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37,
- 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54,
- 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 39,
- 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52, 52, 54, 56, 56, 39,
- 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42,
- 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38,
- 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45,
- 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38, 38,
- 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47,
- 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39,
- 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
- 52, 54, 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41,
- 41, 42, 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56,
- 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
- 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
- 58, 60, 63, 63,
- // Size 4x8
- 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33,
- 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56,
- // Size 8x4
- 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34,
- 34, 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56,
- // Size 8x16
- 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31,
- 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32,
- 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32,
- 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35,
- 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37,
- 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48,
- 48, 54, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50,
- 58, 44, 41, 41, 43, 43, 53, 53, 63,
- // Size 16x8
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31,
- 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32,
- 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33,
- 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34,
- 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36,
- 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42,
- 42, 42, 48, 48, 54, 54, 58, 58, 63,
- // Size 16x32
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
- 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
- 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
- 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
- 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
- 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
- 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
- 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
- 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
- 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
- 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
- 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
- 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
- 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
- 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
- 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
- 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
- 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
- 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
- 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
- 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
- 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
- 63, 63,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
- 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
- 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34,
- 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
- 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39,
- 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
- 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40,
- 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
- 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
- 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
- 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 34,
- 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
- 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35,
- 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
- 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35,
- 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45,
- 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34,
- 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
- 48, 49, 50, 50, 50, 52, 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37,
- 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 51, 51, 51, 52,
- 54, 54, 54, 56, 58, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
- 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
- 58, 60, 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42,
- 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60,
- 63, 63,
- // Size 4x16
- 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32,
- 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34,
- 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43,
- 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58,
- // Size 16x4
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34,
- 34, 33, 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38,
- 37, 37, 39, 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
- // Size 8x32
- 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31,
- 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32,
- 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32,
- 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33,
- 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34,
- 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36,
- 36, 42, 32, 32, 32, 34, 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38,
- 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
- 33, 33, 33, 36, 36, 40, 40, 45, 34, 34, 34, 37, 37, 42, 42, 48, 34,
- 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 35, 34,
- 34, 37, 37, 45, 45, 50, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34,
- 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39,
- 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40,
- 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52,
- 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53,
- 63,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 33, 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34,
- 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
- 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38,
- 38, 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38,
- 39, 40, 40, 40, 42, 43, 43, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35,
- 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50,
- 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
- 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50,
- 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
- 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
- 63},
- {// Chroma
- // Size 4x4
- 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
- // Size 8x8
- 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32,
- 33, 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41,
- 42, 45, 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46,
- 47, 48, 51, 53, 54, 48, 46, 45, 46, 47, 51, 54, 56,
- // Size 16x16
- 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31,
- 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31,
- 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31,
- 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32,
- 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35,
- 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39,
- 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43,
- 47, 47, 47, 47, 48, 48, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47,
- 47, 47, 47, 48, 48, 46, 46, 45, 41, 42, 42, 42, 42, 45, 45, 47, 47,
- 48, 48, 50, 50, 49, 49, 49, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48,
- 48, 50, 50, 49, 49, 49, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50,
- 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
- 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53,
- 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
- 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55,
- 58,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36,
- 39, 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42,
- 42, 42, 45, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42,
- 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47,
- 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47,
- 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
- 35, 35, 35, 37, 39, 39, 39, 41, 42, 42, 42, 44, 47, 47, 47, 46, 46,
- 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35,
- 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45,
- 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37,
- 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45,
- 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37, 40, 40,
- 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45, 32,
- 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42,
- 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34,
- 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45,
- 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34,
- 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46,
- 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35,
- 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47,
- 47, 47, 46, 46, 46, 46, 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37,
- 39, 41, 41, 41, 43, 45, 45, 45, 45, 46, 46, 46, 47, 47, 47, 47, 47,
- 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43,
- 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46,
- 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43,
- 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46,
- 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47,
- 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45,
- 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
- 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47, 41, 42,
- 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48,
- 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42,
- 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48,
- 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42,
- 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50,
- 50, 50, 50, 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44,
- 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51,
- 51, 51, 51, 51, 51, 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47,
- 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53,
- 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47,
- 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53,
- 53, 53, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47,
- 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53,
- 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47,
- 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 48,
- 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48,
- 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47,
- 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49,
- 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47, 47,
- 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51,
- 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45,
- 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
- 53, 54, 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45,
- 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54,
- 55, 55, 55, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
- 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
- 55, 57, 58, 58,
- // Size 4x8
- 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40,
- 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55,
- // Size 8x4
- 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42,
- 42, 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55,
- // Size 8x16
- 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31,
- 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32,
- 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36,
- 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47,
- 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47,
- 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53,
- 53, 53, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53,
- 56, 49, 45, 45, 46, 46, 53, 53, 58,
- // Size 16x8
- 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31,
- 31, 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31,
- 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38,
- 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40,
- 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46,
- 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47,
- 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46,
- 45, 45, 49, 49, 53, 53, 56, 56, 58,
- // Size 16x32
- 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
- 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
- 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
- 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
- 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
- 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
- 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
- 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
- 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
- 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
- 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
- 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
- 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
- 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
- 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
- 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
- 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
- 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
- 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
- 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
- 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
- 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
- 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
- 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
- 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
- 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
- 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
- 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
- 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
- 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
- 58, 58,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37,
- 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42,
- 42, 42, 45, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43,
- 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46,
- 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46,
- 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37,
- 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46,
- 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43,
- 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47,
- 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
- 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46,
- 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
- 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 42,
- 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48,
- 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47,
- 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50,
- 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47,
- 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51,
- 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46,
- 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53,
- 53, 53, 53, 53, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45,
- 46, 46, 46, 46, 46, 46, 46, 46, 48, 50, 50, 50, 51, 53, 53, 53, 54,
- 54, 54, 54, 55, 56, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46,
- 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56,
- 56, 57, 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46,
- 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57,
- 58, 58,
- // Size 4x16
- 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31,
- 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43,
- 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50,
- 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56,
- // Size 16x4
- 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33,
- 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42,
- 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47,
- 45, 45, 46, 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
- // Size 8x32
- 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31,
- 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31,
- 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32,
- 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40,
- 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43,
- 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47,
- 47, 46, 35, 38, 38, 45, 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47,
- 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
- 39, 41, 41, 47, 47, 49, 49, 47, 42, 43, 43, 47, 47, 50, 50, 49, 42,
- 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 45, 44,
- 44, 47, 47, 51, 51, 51, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46,
- 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47,
- 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47,
- 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53,
- 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53,
- 58,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37,
- 39, 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43,
- 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43,
- 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39,
- 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48,
- 48, 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40,
- 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48,
- 47, 47, 47, 47, 46, 46, 46, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
- 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53,
- 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47,
- 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53,
- 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45,
- 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58,
- 58},
- },
- // Quantizer level 11.
- {
- {// Luma
- // Size 4x4
- 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
- // Size 8x8
- 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31,
- 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32,
- 32, 34, 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33,
- 35, 36, 38, 39, 42, 35, 35, 34, 36, 38, 40, 42, 48,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33,
- 33, 33, 34, 34, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 33,
- 34, 34, 34, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 33, 34, 34,
- 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
- 35, 36, 37, 37, 38, 38, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 36,
- 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
- 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41,
- 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45,
- 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
- 48,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
- 34, 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
- 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
- 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
- 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35,
- 35, 35, 35, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
- 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36,
- 36, 36, 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36,
- 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
- 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35,
- 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35,
- 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36,
- 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38,
- 38, 38, 39, 40, 40, 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40,
- 41, 42, 42, 42, 42, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34,
- 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42,
- 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35,
- 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42,
- 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35,
- 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34,
- 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37,
- 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45, 35, 35, 35,
- 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 38, 38,
- 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35, 35,
- 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40,
- 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35,
- 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
- 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34,
- 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42,
- 45, 47, 48, 48, 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35,
- 36, 37, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48,
- 49, 49, 49, 50,
- // Size 4x8
- 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32,
- 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48,
- // Size 8x4
- 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32,
- 32, 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48,
- // Size 8x16
- 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31,
- 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32,
- 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32,
- 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33,
- 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35,
- 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37,
- 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46,
- 48, 36, 35, 34, 36, 38, 38, 46, 48,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32,
- 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32,
- 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33,
- 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34,
- 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36,
- 36, 37, 38, 38, 41, 42, 44, 48, 48,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
- 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
- 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
- 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
- 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
- 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
- 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
- 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
- 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
- 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
- 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
- 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
- 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
- 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
- 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
- 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
- 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
- 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
- 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
- 49, 49,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
- 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34,
- 34, 34, 34, 34, 34, 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36,
- 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
- 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 37,
- 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
- 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
- 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
- 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33,
- 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37,
- 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34,
- 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41,
- 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34,
- 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42,
- 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34,
- 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
- 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36,
- 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48,
- 48, 49,
- // Size 4x16
- 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32,
- 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32,
- 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37,
- 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48,
- // Size 16x4
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32,
- 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35,
- 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
- // Size 8x32
- 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31,
- 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32,
- 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32,
- 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32,
- 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33,
- 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34,
- 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36,
- 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37,
- 32, 32, 33, 33, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32,
- 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 33,
- 33, 34, 36, 36, 39, 40, 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34,
- 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35,
- 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38,
- 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38,
- 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46,
- 49,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35,
- 35, 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37,
- 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
- 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35,
- 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46,
- 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
- 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48,
- 49},
- {// Chroma
- // Size 4x4
- 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
- // Size 8x8
- 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30,
- 31, 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37,
- 39, 42, 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42,
- 45, 47, 48, 48, 50, 48, 47, 46, 47, 47, 49, 50, 53,
- // Size 16x16
- 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31,
- 31, 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31,
- 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31,
- 31, 31, 31, 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31,
- 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32,
- 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34,
- 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38,
- 39, 40, 43, 43, 44, 45, 46, 47, 47, 34, 36, 36, 36, 37, 37, 40, 40,
- 42, 45, 45, 45, 46, 46, 47, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45,
- 47, 47, 47, 47, 47, 48, 48, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47,
- 47, 47, 47, 47, 48, 48, 40, 41, 41, 41, 42, 42, 44, 44, 45, 47, 47,
- 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48,
- 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49,
- 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
- 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53,
- 53,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34,
- 36, 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31,
- 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37,
- 37, 37, 39, 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38,
- 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41,
- 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42,
- 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44,
- 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
- 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44, 46, 47,
- 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
- 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46, 46, 46, 46,
- 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
- 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30,
- 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39,
- 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40,
- 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42,
- 43, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44,
- 44, 44, 45, 46, 47, 47, 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35,
- 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45,
- 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
- 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47,
- 47, 47, 47, 47, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38,
- 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47,
- 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40,
- 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 47,
- 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
- 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37,
- 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47,
- 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38,
- 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47,
- 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38,
- 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47,
- 47, 47, 47, 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41,
- 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48,
- 48, 48, 48, 49, 49, 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42,
- 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49,
- 49, 50, 50, 50, 49, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43,
- 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50,
- 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45,
- 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50,
- 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45,
- 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 44,
- 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47,
- 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51, 47, 47, 46,
- 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47, 47, 47,
- 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47, 47,
- 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49,
- 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47,
- 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
- 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46,
- 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50,
- 51, 52, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46,
- 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
- 53, 53, 53, 53,
- // Size 4x8
- 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37,
- 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53,
- // Size 8x4
- 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37,
- 38, 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53,
- // Size 8x16
- 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31,
- 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32,
- 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35,
- 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40,
- 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47,
- 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47,
- 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52,
- 53, 49, 47, 46, 47, 48, 48, 52, 53,
- // Size 16x8
- 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31,
- 31, 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31,
- 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34,
- 34, 35, 35, 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39,
- 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40,
- 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44,
- 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47,
- 47, 47, 47, 47, 49, 50, 51, 53, 53,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
- 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
- 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
- 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
- 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
- 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
- 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
- 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
- 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
- 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
- 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
- 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
- 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
- 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
- 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
- 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
- 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
- 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
- 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
- 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
- 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
- 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
- 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
- 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
- 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
- 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
- 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
- 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
- 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
- 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
- 53, 53,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35,
- 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38,
- 38, 38, 39, 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39,
- 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42,
- 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43,
- 43, 44, 46, 46, 46, 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44,
- 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
- 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47,
- 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41,
- 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47, 48, 48, 48,
- 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
- 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37,
- 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46,
- 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38,
- 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47,
- 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41,
- 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48,
- 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45,
- 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49,
- 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46,
- 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50,
- 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
- 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
- 53, 53, 53, 53, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
- 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53,
- 53, 53,
- // Size 4x16
- 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31,
- 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37,
- 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47,
- 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31,
- 31, 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38,
- 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47,
- 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
- // Size 8x32
- 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31,
- 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31,
- 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32,
- 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35,
- 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40,
- 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42,
- 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46,
- 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47,
- 36, 38, 39, 42, 46, 46, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37,
- 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 39, 40,
- 41, 43, 47, 47, 48, 48, 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43,
- 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44,
- 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48,
- 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48,
- 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52,
- 53,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35,
- 36, 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39,
- 39, 39, 40, 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40,
- 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44,
- 44, 44, 44, 45, 46, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 39, 39,
- 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47,
- 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40,
- 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
- 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45,
- 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52,
- 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47,
- 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
- 53},
- },
- // Quantizer level 12.
- {
- {// Luma
- // Size 4x4
- 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
- // Size 8x8
- 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31,
- 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32,
- 32, 32, 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32,
- 32, 34, 34, 35, 36, 33, 33, 33, 33, 35, 35, 36, 38,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 34, 34, 34, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35,
- 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35,
- 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
- 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38,
- 39,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
- 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
- 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
- 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
- 34, 34, 34, 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
- 35, 35, 35, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
- 36, 36, 36, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37,
- 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
- 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33, 33,
- 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
- 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
- 36, 36, 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34,
- 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
- 37, 37, 38, 38, 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
- 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
- 38, 38, 39, 39,
- // Size 4x8
- 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32,
- 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36,
- // Size 8x4
- 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32,
- 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36,
- // Size 8x16
- 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31,
- 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
- 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32,
- 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32,
- 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33,
- 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35,
- 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36,
- 36, 34, 34, 34, 34, 35, 37, 37, 38,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32,
- 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33,
- 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33,
- 33, 34, 34, 34, 35, 36, 36, 36, 38,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
- 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
- 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
- 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
- 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
- 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
- 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
- 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
- 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
- 38, 39,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
- 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
- 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
- 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
- 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
- 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
- 36, 37, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33,
- 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38,
- 39, 39,
- // Size 4x16
- 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
- 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32,
- 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34,
- 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37,
- // Size 16x4
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32,
- 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
- // Size 8x32
- 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31,
- 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
- 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32,
- 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
- 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32,
- 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33,
- 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33,
- 33, 31, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 33, 34, 34, 34,
- 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32,
- 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
- 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33,
- 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33,
- 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34,
- 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36,
- 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37,
- 38,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
- 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
- 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36,
- 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
- 38},
- {// Chroma
- // Size 4x4
- 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
- // Size 8x8
- 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31,
- 31, 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34,
- 35, 35, 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39,
- 40, 43, 44, 47, 47, 40, 41, 41, 42, 44, 45, 47, 48,
- // Size 16x16
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31,
- 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31,
- 31, 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31,
- 31, 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31,
- 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31,
- 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32,
- 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32,
- 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 34, 34, 35,
- 37, 38, 38, 41, 42, 42, 43, 44, 33, 34, 34, 34, 35, 35, 35, 36, 38,
- 39, 39, 41, 43, 43, 44, 45, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39,
- 39, 41, 43, 43, 44, 45, 35, 36, 37, 37, 37, 38, 38, 38, 41, 41, 41,
- 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46,
- 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47,
- 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
- 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48,
- 48,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
- 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34,
- 34, 34, 35, 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34,
- 35, 36, 37, 37, 37, 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36,
- 38, 38, 38, 38, 38, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38,
- 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38,
- 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40,
- 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39, 40, 41, 42,
- 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
- 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 30,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35,
- 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35,
- 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36,
- 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39,
- 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40,
- 40, 40, 41, 42, 42, 42, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 40, 40, 40, 41,
- 41, 42, 43, 43, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41, 41, 41, 41, 42, 42, 43,
- 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35,
- 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43, 43, 44, 44, 44,
- 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
- 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34,
- 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39,
- 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34,
- 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39,
- 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41,
- 43, 43, 43, 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36,
- 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44,
- 44, 44, 44, 45, 45, 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37,
- 38, 38, 38, 38, 38, 39, 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46,
- 46, 46, 46, 46, 46, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39,
- 39, 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 47, 47, 47, 47, 47,
- 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40,
- 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47,
- 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41,
- 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 36,
- 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43,
- 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38,
- 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43,
- 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40,
- 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45,
- 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41,
- 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
- 47, 47, 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42,
- 42, 42, 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47,
- 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
- 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
- 48, 48, 48, 48,
- // Size 4x8
- 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34,
- 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47,
- // Size 8x4
- 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35,
- 36, 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47,
- // Size 8x16
- 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31,
- 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31,
- 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32,
- 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35,
- 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38,
- 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47,
- 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47,
- 47, 42, 42, 43, 43, 44, 47, 47, 48,
- // Size 16x8
- 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31,
- 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31,
- 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31,
- 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34,
- 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39,
- 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40,
- 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41,
- 41, 43, 44, 44, 46, 47, 47, 47, 48,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
- 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
- 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
- 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
- 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
- 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
- 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
- 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
- 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
- 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
- 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
- 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
- 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
- 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
- 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
- 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
- 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
- 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
- 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
- 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
- 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
- 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
- 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
- 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
- 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
- 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
- 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
- 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
- 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
- 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
- 48, 48,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
- 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34,
- 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35,
- 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38,
- 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40,
- 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40,
- 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41,
- 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41, 42, 42, 43,
- 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36,
- 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 35,
- 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40,
- 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38,
- 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43,
- 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38,
- 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44,
- 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38,
- 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47,
- 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39,
- 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47,
- 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41,
- 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47,
- 47, 47, 48, 48, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
- 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48,
- 48, 48,
- // Size 4x16
- 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31,
- 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35,
- 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45,
- 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31,
- 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35,
- 36, 36, 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38,
- 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
- // Size 8x32
- 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31,
- 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31,
- 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31,
- 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32,
- 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35,
- 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40,
- 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40,
- 41, 32, 33, 34, 34, 36, 41, 41, 42, 33, 34, 35, 35, 37, 42, 42, 43,
- 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33,
- 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 34, 36,
- 37, 37, 39, 44, 44, 45, 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39,
- 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40,
- 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43,
- 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47,
- 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47,
- 48,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33,
- 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35,
- 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36,
- 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38,
- 39, 40, 40, 40, 40, 41, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 35, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43,
- 43, 43, 43, 43, 44, 44, 44, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39,
- 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47,
- 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40,
- 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47,
- 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41,
- 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48,
- 48},
- },
- // Quantizer level 13.
- {
- {// Luma
- // Size 4x4
- 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
- // Size 8x8
- 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
- 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
- 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 33, 33, 33, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
- 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 33,
- // Size 4x8
- 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
- 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
- // Size 8x4
- 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- // Size 8x16
- 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31,
- 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
- 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
- 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
- 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
- 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32,
- 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33,
- 34, 32, 32, 32, 32, 32, 32, 33, 34,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 34, 34, 34,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
- 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
- 34, 34,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
- 34, 34, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
- 34, 34,
- // Size 4x16
- 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
- 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
- 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
- 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- // Size 8x32
- 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31,
- 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
- 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
- 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
- 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
- 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
- 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
- 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
- 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31,
- 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
- 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32,
- 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
- 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32,
- 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32,
- 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33,
- 34,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
- 34},
- {// Chroma
- // Size 4x4
- 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
- // Size 8x8
- 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31,
- 31, 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31,
- 31, 31, 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34,
- 35, 35, 36, 39, 39, 33, 34, 34, 35, 35, 36, 39, 39,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31,
- 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 33, 34, 35, 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
- 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
- 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38,
- 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
- 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39,
- 39,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
- 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
- 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
- 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
- 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34,
- 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34,
- 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34,
- 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 32, 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33,
- 34, 35, 35, 35, 35, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35,
- 35, 35, 35, 35, 35, 36, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 35, 35, 35,
- 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35,
- 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
- 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30,
- 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
- 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
- 36, 36, 36, 36, 36, 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37,
- 37, 37, 37, 37, 37, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
- 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37, 37,
- 37, 37, 38, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
- 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 38, 38, 38, 38, 38, 38,
- 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
- 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33,
- 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
- 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34,
- 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35,
- 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34,
- 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
- 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34,
- 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
- 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
- 39, 39, 39, 39, 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
- 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40,
- 40, 40, 40, 40,
- // Size 4x8
- 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31,
- 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40,
- // Size 8x4
- 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31,
- 31, 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40,
- // Size 8x16
- 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31,
- 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31,
- 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31,
- 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32,
- 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32,
- 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35,
- 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38,
- 41, 33, 34, 35, 36, 36, 36, 38, 41,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34,
- 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37,
- 38, 38, 38, 38, 39, 40, 41, 41, 41,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
- 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
- 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
- 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
- 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
- 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
- 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
- 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
- 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
- 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
- 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
- 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
- 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
- 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
- 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
- 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
- 42, 44,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
- 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
- 33, 34, 35, 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
- 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36,
- 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36,
- 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36,
- 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32,
- 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
- 35, 35, 36, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34,
- 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37,
- 37, 38, 38, 38, 38, 38, 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35,
- 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40,
- 40, 40, 40, 40, 40, 40, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37,
- 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41,
- 41, 41, 41, 42, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 38, 39,
- 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 42, 43, 43, 43, 43, 43,
- 43, 44,
- // Size 4x16
- 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31,
- 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32,
- 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35,
- 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35,
- 35, 35, 35, 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
- // Size 8x32
- 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31,
- 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31,
- 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31,
- 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31,
- 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31,
- 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32,
- 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34,
- 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
- 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30,
- 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 31,
- 32, 33, 33, 33, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34,
- 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36,
- 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36,
- 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36,
- 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39,
- 42,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
- 30, 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
- 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35,
- 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36,
- 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
- 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38,
- 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37,
- 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41,
- 42},
- },
- // Quantizer level 14.
- {
- {// Luma
- // Size 4x4
- 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
- // Size 8x8
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
- 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
- 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32,
- // Size 4x8
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31,
- 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
- // Size 8x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31,
- 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
- // Size 8x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
- 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
- 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
- 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
- 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
- 32, 31, 31, 32, 32, 32, 32, 32, 32,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- // Size 4x16
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31,
- 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
- 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32,
- 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- // Size 8x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32,
- 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
- 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
- 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
- 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31,
- 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
- 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32,
- 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
- 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
- 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
- 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
- 32,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32},
- {// Chroma
- // Size 4x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- // Size 8x8
- 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
- // Size 16x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32,
- // Size 32x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
- 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32,
- // Size 4x8
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32,
- // Size 8x4
- 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
- // Size 8x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32,
- 32, 30, 31, 31, 31, 32, 32, 32, 32,
- // Size 16x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 32, 32, 32, 32,
- // Size 16x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
- 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32,
- // Size 32x16
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
- 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
- 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
- 32, 32,
- // Size 4x16
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32,
- // Size 16x4
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- // Size 8x32
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31,
- 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31,
- 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32,
- 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32,
- 32,
- // Size 32x8
- 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
- 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
- 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
- 32},
- },
-};
+constexpr uint8_t kQuantizerMatrix4x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][32] = {
+ {{32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84,
+ 91, 49, 71, 103, 110, 65, 84, 125, 128, 80, 97,
+ 142, 152, 91, 100, 145, 178, 104, 112, 146, 190},
+ {31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73,
+ 54, 65, 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105}},
+ {{32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77,
+ 88, 46, 67, 93, 105, 60, 79, 112, 122, 75, 92,
+ 130, 144, 86, 95, 136, 167, 98, 105, 136, 177},
+ {31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72,
+ 52, 64, 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102}},
+ {{32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68,
+ 85, 44, 61, 85, 101, 54, 69, 98, 117, 72, 84,
+ 118, 136, 82, 89, 129, 157, 92, 98, 127, 165},
+ {31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71,
+ 50, 59, 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99}},
+ {{32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65,
+ 82, 41, 53, 78, 97, 51, 61, 92, 111, 65, 73,
+ 108, 129, 75, 81, 117, 148, 86, 92, 119, 154},
+ {31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70,
+ 49, 55, 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95}},
+ {{32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87,
+ 48, 59, 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144},
+ {31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65,
+ 47, 54, 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93}},
+ {{32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84,
+ 45, 56, 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136},
+ {31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64,
+ 46, 54, 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90}},
+ {{32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68,
+ 41, 48, 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111},
+ {31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57,
+ 45, 49, 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79}},
+ {{32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59,
+ 38, 40, 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97},
+ {31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53,
+ 46, 47, 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73}},
+ {{32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54,
+ 35, 38, 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83},
+ {31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50,
+ 47, 48, 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67}},
+ {{31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42,
+ 34, 37, 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67},
+ {31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45,
+ 43, 47, 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40,
+ 33, 34, 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56},
+ {31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46,
+ 40, 44, 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55}},
+ {{31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36,
+ 32, 33, 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47,
+ 37, 39, 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36},
+ {31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40,
+ 34, 36, 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47}},
+ {{31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32}}};
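
(Editorial sketch, not part of the change above.) Each new table is indexed as [quantizer level][plane type][flattened matrix]: the kQuantizerMatrix4x8 initializer holds 15 level entries, each with a luma plane and a chroma plane of 32 = 4 * 8 bytes, and within a plane the values grow down and to the right in groups of four, which suggests row-major storage with a width of 4. A hypothetical accessor under those assumptions — the function name and the row/column convention are the editor's, not libgav1's:

    #include <cstdint>

    // Hypothetical helper (editorial illustration only); assumes the
    // innermost 32 bytes are row-major with a width of 4.
    //   level:      0..14 (15 levels, counted from the initializer above)
    //   plane_type: 0 = luma, 1 = chroma (kNumPlaneTypes appears to be 2)
    //   row:        0..7, column: 0..3
    inline uint8_t QuantizerMatrix4x8Value(int level, int plane_type,
                                           int row, int column) {
      return kQuantizerMatrix4x8[level][plane_type][row * 4 + column];
    }
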
+constexpr uint8_t kQuantizerMatrix4x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][64] = {
+ {{31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34,
+ 48, 73, 83, 34, 54, 78, 89, 41, 63, 90, 95, 45, 67,
+ 96, 102, 54, 75, 110, 111, 60, 79, 118, 123, 72, 90, 133,
+ 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100, 140, 173,
+ 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197},
+ {31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60,
+ 46, 53, 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74,
+ 52, 64, 82, 80, 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96,
+ 63, 69, 89, 101, 65, 68, 89, 103, 67, 70, 86, 105, 69, 72, 88, 107}},
+ {{31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33,
+ 44, 66, 81, 34, 54, 74, 86, 37, 58, 79, 92, 44, 66,
+ 90, 98, 49, 71, 99, 107, 56, 77, 107, 117, 65, 84, 119,
+ 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132, 163,
+ 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183},
+ {31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59,
+ 46, 53, 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73,
+ 51, 63, 77, 78, 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93,
+ 62, 67, 86, 98, 64, 66, 87, 100, 65, 68, 83, 102, 67, 70, 86, 103}},
+ {{31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33,
+ 40, 58, 78, 34, 47, 65, 83, 37, 54, 73, 89, 41, 58,
+ 79, 94, 46, 62, 86, 102, 53, 68, 97, 112, 60, 73, 105,
+ 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125, 154,
+ 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170},
+ {31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58,
+ 44, 51, 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71,
+ 49, 58, 73, 77, 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91,
+ 60, 66, 84, 95, 62, 64, 84, 97, 64, 66, 81, 99, 65, 68, 83, 100}},
+ {{31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33,
+ 38, 56, 76, 34, 42, 61, 81, 34, 48, 66, 85, 39, 51,
+ 74, 91, 44, 56, 82, 98, 49, 60, 90, 107, 54, 63, 95,
+ 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159},
+ {31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57,
+ 43, 50, 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70,
+ 48, 54, 70, 75, 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89,
+ 58, 61, 81, 93, 60, 63, 82, 94, 62, 64, 79, 96, 63, 66, 81, 97}},
+ {{31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32,
+ 37, 49, 71, 33, 41, 53, 74, 34, 48, 60, 80, 37, 50,
+ 65, 85, 41, 53, 71, 91, 45, 56, 76, 98, 49, 60, 82,
+ 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130,
+ 74, 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148},
+ {31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56,
+ 42, 49, 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83,
+ 56, 59, 73, 87, 58, 61, 75, 90, 60, 62, 76, 92, 62, 64, 78, 94}},
+ {{31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71,
+ 33, 38, 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84,
+ 41, 53, 71, 90, 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110,
+ 60, 68, 92, 118, 65, 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136},
+ {31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56,
+ 40, 47, 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64,
+ 45, 53, 61, 67, 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78,
+ 52, 56, 70, 82, 54, 58, 72, 85, 57, 60, 75, 89, 59, 61, 75, 90}},
+ {{31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59,
+ 32, 37, 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71,
+ 39, 46, 60, 76, 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92,
+ 54, 58, 76, 98, 60, 63, 80, 105, 66, 68, 85, 111, 73, 74, 91, 118},
+ {31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51,
+ 40, 47, 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58,
+ 46, 49, 57, 61, 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71,
+ 50, 52, 63, 73, 52, 53, 64, 76, 55, 55, 66, 79, 58, 58, 68, 82}},
+ {{31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53,
+ 32, 34, 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63,
+ 37, 40, 57, 67, 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79,
+ 50, 50, 71, 86, 54, 53, 74, 90, 57, 56, 77, 93, 61, 58, 79, 97},
+ {31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49,
+ 37, 44, 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55,
+ 46, 47, 55, 58, 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65,
+ 48, 47, 61, 68, 50, 48, 62, 70, 51, 49, 63, 71, 53, 50, 64, 73}},
+ {{31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49,
+ 32, 34, 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54,
+ 35, 38, 49, 60, 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71,
+ 45, 45, 58, 75, 47, 47, 60, 77, 51, 50, 63, 82, 55, 54, 67, 87},
+ {31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48,
+ 35, 43, 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50,
+ 47, 48, 53, 54, 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61,
+ 47, 46, 55, 63, 48, 47, 55, 64, 49, 47, 56, 66, 51, 49, 57, 68}},
+ {{31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41,
+ 32, 34, 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44,
+ 34, 37, 42, 48, 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58,
+ 40, 41, 51, 60, 42, 43, 53, 63, 45, 45, 56, 66, 46, 46, 56, 67},
+ {31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45,
+ 34, 42, 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47,
+ 42, 47, 50, 49, 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56,
+ 47, 46, 53, 57, 46, 46, 53, 58, 48, 46, 54, 59, 48, 46, 54, 59}},
+ {{31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37,
+ 32, 32, 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40,
+ 32, 34, 37, 40, 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51,
+ 35, 36, 43, 51, 38, 39, 45, 54, 38, 39, 45, 54, 42, 42, 48, 58},
+ {31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45,
+ 31, 35, 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46,
+ 38, 43, 47, 46, 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53,
+ 48, 47, 50, 53, 47, 46, 50, 54, 47, 46, 50, 54, 47, 45, 49, 56}},
+ {{31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36,
+ 32, 32, 34, 37, 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41,
+ 34, 34, 37, 42, 34, 34, 37, 44, 35, 34, 38, 48, 35, 34, 38, 48},
+ {31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46,
+ 31, 32, 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47,
+ 36, 37, 44, 47, 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49,
+ 42, 43, 47, 50, 44, 44, 47, 51, 48, 46, 48, 53, 48, 46, 48, 53}},
+ {{31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35,
+ 32, 33, 34, 35, 32, 33, 34, 35, 33, 33, 35, 36, 34, 34, 36, 37},
+ {31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38,
+ 31, 32, 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40,
+ 33, 35, 40, 42, 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45,
+ 38, 40, 45, 47, 38, 40, 45, 47, 39, 41, 45, 47, 42, 43, 46, 47}},
+ {{31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33},
+ {31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35,
+ 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37,
+ 33, 35, 35, 39, 34, 36, 36, 40, 34, 36, 36, 40, 34, 36, 36, 40}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 30, 31, 32, 32}}};
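
(Editorial sketch, continuing the note above.) The layout repeats per transform size with only the innermost extent changing: 32 = 4 * 8, 64 = 4 * 16, and 128 = 8 * 16 for the three tables in this part of the diff (kQuantizerMatrix8x16 is declared just below). A hypothetical dispatch over those sizes returning the flat per-plane matrix; the function name and the nullptr fallback are illustrative assumptions, not libgav1 API:

    #include <cstdint>

    // Illustrative only: returns one (level, plane) matrix for the sizes
    // visible in this hunk, or nullptr for any other width/height pair.
    inline const uint8_t* QuantizerMatrixFor(int width, int height,
                                             int level, int plane_type) {
      if (width == 4 && height == 8) {
        return kQuantizerMatrix4x8[level][plane_type];
      }
      if (width == 4 && height == 16) {
        return kQuantizerMatrix4x16[level][plane_type];
      }
      if (width == 8 && height == 16) {
        return kQuantizerMatrix8x16[level][plane_type];
      }
      return nullptr;
    }
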
+constexpr uint8_t kQuantizerMatrix8x16
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][128] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59,
+ 78, 86, 93, 32, 34, 36, 50, 59, 77, 82, 89, 34, 37,
+ 42, 54, 63, 79, 80, 88, 36, 38, 48, 60, 68, 84, 86,
+ 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76,
+ 85, 102, 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65,
+ 58, 68, 92, 105, 124, 122, 124, 79, 70, 79, 104, 118, 141,
+ 135, 135, 82, 72, 81, 106, 121, 144, 149, 146, 91, 80, 88,
+ 106, 130, 148, 162, 159, 97, 86, 94, 107, 128, 157, 167, 171,
+ 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101, 117, 138,
+ 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203},
+ {32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66,
+ 33, 43, 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62,
+ 49, 48, 53, 54, 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66,
+ 50, 46, 54, 64, 67, 73, 72, 70, 54, 49, 55, 68, 73, 80, 76, 75,
+ 57, 50, 56, 70, 76, 84, 80, 79, 63, 55, 60, 75, 82, 92, 87, 84,
+ 64, 56, 61, 75, 83, 93, 93, 89, 68, 59, 64, 74, 86, 94, 98, 94,
+ 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75, 83, 92, 101, 104,
+ 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82, 91, 101, 109}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60,
+ 72, 84, 90, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 40, 47, 60, 71, 78, 85, 36, 37, 48, 56, 68, 78, 83,
+ 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69,
+ 84, 95, 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61,
+ 56, 65, 81, 100, 113, 116, 118, 71, 64, 73, 89, 111, 125,
+ 129, 129, 79, 70, 79, 95, 118, 133, 142, 138, 86, 76, 84,
+ 100, 124, 140, 153, 150, 92, 82, 89, 101, 121, 148, 157, 161,
+ 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110, 129,
+ 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188},
+ {32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65,
+ 33, 41, 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65,
+ 50, 46, 54, 61, 66, 70, 71, 69, 52, 47, 54, 63, 71, 75, 75, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 60, 53, 58, 68, 79, 85, 85, 82,
+ 63, 55, 60, 70, 82, 89, 91, 87, 66, 58, 62, 72, 84, 91, 95, 91,
+ 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73, 81, 89, 98, 101,
+ 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79, 89, 98, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54,
+ 73, 81, 88, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 38, 42, 52, 69, 76, 82, 34, 36, 44, 50, 59, 75, 81,
+ 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63,
+ 74, 90, 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57,
+ 53, 63, 74, 90, 108, 111, 113, 65, 59, 68, 79, 97, 118,
+ 123, 122, 71, 64, 73, 84, 102, 125, 135, 131, 81, 72, 80,
+ 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148, 151,
+ 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121,
+ 141, 160, 169, 103, 94, 92, 103, 119, 137, 158, 175},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65,
+ 33, 40, 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60,
+ 44, 46, 51, 51, 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64,
+ 49, 45, 53, 58, 62, 67, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72,
+ 54, 49, 55, 62, 70, 77, 77, 76, 57, 51, 56, 64, 73, 82, 83, 81,
+ 60, 53, 58, 65, 75, 85, 89, 85, 64, 57, 61, 68, 78, 89, 93, 89,
+ 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71, 79, 87, 96, 98,
+ 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86, 95, 102}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76,
+ 85, 31, 33, 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58,
+ 71, 79, 34, 35, 41, 48, 54, 63, 76, 81, 36, 36, 46, 54, 60,
+ 68, 80, 87, 41, 40, 49, 60, 67, 76, 88, 93, 47, 44, 53, 66,
+ 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106, 108, 58, 54, 61,
+ 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120, 124, 74, 67,
+ 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139, 142, 87,
+ 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150, 157,
+ 97, 88, 86, 97, 111, 128, 147, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64,
+ 33, 37, 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59,
+ 42, 44, 49, 49, 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63,
+ 48, 46, 51, 57, 59, 61, 66, 67, 50, 46, 52, 59, 63, 66, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 54, 49, 54, 62, 68, 73, 79, 79,
+ 57, 51, 55, 64, 70, 76, 83, 83, 61, 55, 58, 66, 73, 80, 87, 87,
+ 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69, 77, 84, 93, 95,
+ 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83, 92, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74,
+ 82, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58,
+ 69, 77, 33, 34, 38, 44, 52, 62, 72, 78, 36, 35, 42, 51, 58,
+ 68, 78, 84, 39, 38, 44, 54, 63, 73, 84, 89, 44, 41, 46, 59,
+ 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103, 53, 49, 53,
+ 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66, 60,
+ 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147,
+ 91, 82, 80, 90, 103, 119, 137, 151},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63,
+ 31, 35, 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58,
+ 41, 43, 48, 49, 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62,
+ 48, 46, 49, 54, 57, 60, 64, 65, 49, 45, 48, 56, 61, 64, 67, 69,
+ 50, 46, 49, 57, 63, 67, 71, 73, 52, 48, 50, 58, 65, 71, 75, 77,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 53, 61, 69, 77, 82, 85,
+ 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65, 73, 82, 89, 92,
+ 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81, 89, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69,
+ 31, 32, 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 38, 36, 40, 49, 56, 63, 69, 77,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 42, 43, 54, 64, 72, 79, 86,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 50, 60, 71, 82, 90, 99,
+ 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68, 79, 92, 102, 112,
+ 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57,
+ 30, 32, 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 48, 46, 47, 53, 55, 56, 58, 61,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 62, 64, 66,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 70, 73,
+ 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56, 64, 70, 75, 79,
+ 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 34, 34, 37, 41, 44, 48, 54, 63, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 41, 39, 41, 49, 54, 60, 67, 76,
+ 44, 41, 43, 51, 57, 63, 71, 79, 48, 45, 46, 54, 60, 67, 76, 85,
+ 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54,
+ 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 42, 43, 47, 49, 50, 49, 50, 53, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 45, 46, 51, 54, 57, 59, 61,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 64, 67,
+ 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 36, 34, 37, 40, 48, 54, 56, 60, 38, 36, 39, 41, 49, 56, 58, 63,
+ 39, 37, 40, 42, 50, 58, 60, 65, 44, 41, 42, 45, 53, 63, 66, 71,
+ 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48,
+ 33, 36, 41, 44, 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 49, 46, 48, 49, 53, 53, 54, 54, 48, 46, 47, 48, 53, 55, 55, 56,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 45, 47, 53, 58, 59, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47,
+ 34, 34, 35, 37, 41, 42, 48, 50, 35, 34, 36, 38, 45, 47, 52, 55,
+ 36, 34, 36, 38, 46, 48, 54, 56, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47,
+ 42, 43, 44, 47, 49, 50, 49, 50, 47, 46, 46, 48, 51, 52, 53, 53,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 33, 33, 33, 35, 36, 36, 40, 41,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 41, 42, 42, 44, 47, 47, 49, 49,
+ 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 35, 35, 35,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 35, 37, 38, 38, 41, 45, 45, 46,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 32, 33, 33, 33, 33, 36, 39,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
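+// 8x32 quantizer matrices: 256 (8*32) coefficients per quantizer level and
+// plane type.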
+constexpr uint8_t kQuantizerMatrix8x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][256] = {
+ {{32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82,
+ 88, 94, 31, 33, 34, 49, 59, 78, 86, 93, 31, 33, 35, 49,
+ 59, 78, 84, 90, 32, 34, 36, 50, 59, 77, 82, 89, 32, 35,
+ 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88,
+ 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84,
+ 86, 90, 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71,
+ 79, 95, 94, 97, 46, 44, 55, 73, 82, 98, 98, 99, 48, 46,
+ 56, 76, 85, 102, 105, 105, 53, 50, 60, 82, 92, 109, 107, 107,
+ 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66, 89, 101, 120,
+ 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73, 97,
+ 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71,
+ 80, 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146,
+ 88, 77, 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148,
+ 162, 159, 94, 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107,
+ 128, 157, 167, 171, 100, 89, 97, 111, 127, 152, 173, 182, 103, 93,
+ 98, 114, 131, 150, 174, 186, 107, 96, 100, 117, 136, 155, 177, 191,
+ 110, 100, 101, 117, 138, 161, 183, 193, 114, 104, 103, 117, 137, 159,
+ 185, 201, 118, 107, 105, 118, 136, 157, 182, 203, 122, 111, 107, 119,
+ 136, 156, 179, 204},
+ {32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67,
+ 30, 40, 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64,
+ 33, 43, 47, 47, 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62,
+ 42, 47, 50, 50, 53, 60, 59, 62, 45, 47, 51, 52, 55, 61, 61, 61,
+ 49, 48, 53, 54, 57, 62, 62, 62, 48, 47, 53, 57, 60, 66, 65, 64,
+ 49, 46, 53, 61, 64, 69, 66, 66, 49, 46, 53, 62, 65, 71, 68, 67,
+ 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66, 71, 77, 73, 71,
+ 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82, 79, 76,
+ 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86,
+ 64, 56, 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91,
+ 68, 59, 64, 74, 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97,
+ 70, 62, 66, 73, 83, 96, 99, 98, 71, 63, 67, 74, 82, 93, 102, 102,
+ 72, 64, 66, 75, 83, 92, 101, 104, 73, 65, 66, 75, 84, 93, 102, 106,
+ 74, 67, 66, 74, 84, 94, 103, 106, 75, 68, 66, 74, 83, 93, 103, 109,
+ 76, 69, 67, 73, 82, 91, 101, 109, 77, 70, 67, 73, 81, 90, 99, 108}},
+ {{32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75,
+ 86, 91, 31, 32, 35, 44, 60, 72, 84, 90, 31, 33, 35, 44,
+ 59, 71, 82, 87, 32, 34, 36, 45, 59, 71, 80, 87, 32, 35,
+ 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71, 78, 85,
+ 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78,
+ 83, 87, 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60,
+ 73, 84, 91, 94, 44, 42, 53, 66, 79, 90, 94, 96, 47, 45,
+ 56, 69, 84, 95, 101, 101, 49, 47, 57, 71, 86, 97, 103, 102,
+ 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79, 98, 110,
+ 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68,
+ 76, 92, 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138,
+ 82, 73, 81, 97, 121, 136, 145, 144, 86, 76, 84, 100, 124, 140,
+ 153, 150, 89, 79, 87, 99, 124, 145, 156, 156, 92, 82, 89, 101,
+ 121, 148, 157, 161, 95, 85, 92, 105, 120, 143, 163, 171, 98, 88,
+ 93, 108, 124, 141, 163, 174, 101, 91, 94, 110, 128, 146, 166, 179,
+ 104, 94, 95, 110, 129, 151, 171, 181, 107, 97, 96, 110, 128, 149,
+ 173, 188, 110, 100, 98, 111, 127, 147, 169, 188, 114, 104, 100, 111,
+ 127, 145, 166, 190},
+ {32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66,
+ 30, 38, 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63,
+ 33, 41, 47, 46, 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62,
+ 39, 46, 48, 47, 51, 55, 58, 61, 42, 46, 50, 50, 53, 57, 60, 60,
+ 49, 48, 53, 54, 57, 60, 61, 61, 48, 47, 53, 55, 58, 62, 64, 63,
+ 48, 46, 53, 56, 60, 64, 65, 65, 49, 45, 53, 59, 64, 67, 67, 66,
+ 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61, 68, 71, 72, 70,
+ 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78, 78, 74,
+ 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84,
+ 63, 55, 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89,
+ 66, 58, 62, 72, 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94,
+ 68, 60, 64, 71, 81, 94, 97, 96, 69, 61, 65, 72, 80, 91, 99, 100,
+ 70, 62, 65, 73, 81, 89, 98, 101, 71, 64, 65, 73, 82, 90, 99, 103,
+ 72, 65, 65, 72, 82, 92, 100, 103, 73, 66, 65, 72, 81, 90, 100, 105,
+ 74, 67, 65, 71, 79, 89, 98, 105, 75, 68, 65, 71, 78, 87, 96, 105}},
+ {{32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75,
+ 83, 88, 31, 32, 35, 41, 54, 73, 81, 88, 31, 32, 34, 41,
+ 53, 72, 79, 84, 32, 33, 36, 42, 53, 71, 78, 84, 32, 34,
+ 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69, 76, 82,
+ 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75,
+ 81, 84, 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58,
+ 68, 84, 88, 90, 40, 40, 51, 59, 70, 85, 91, 92, 44, 42,
+ 53, 63, 74, 90, 97, 97, 47, 45, 56, 66, 79, 95, 99, 98,
+ 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86, 103,
+ 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75,
+ 91, 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62,
+ 71, 83, 100, 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131,
+ 79, 71, 79, 90, 109, 133, 137, 136, 81, 72, 80, 91, 110, 135,
+ 145, 141, 82, 73, 81, 92, 111, 136, 147, 147, 87, 77, 85, 96,
+ 114, 140, 148, 151, 90, 80, 87, 99, 113, 135, 153, 160, 92, 83,
+ 88, 102, 117, 133, 153, 163, 95, 85, 88, 103, 120, 137, 155, 168,
+ 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90, 103, 120, 139,
+ 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97, 93, 104,
+ 118, 135, 155, 176},
+ {32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65,
+ 31, 36, 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62,
+ 33, 40, 47, 46, 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61,
+ 37, 44, 47, 45, 48, 54, 57, 60, 42, 45, 50, 49, 51, 57, 59, 59,
+ 44, 46, 51, 51, 53, 59, 60, 61, 49, 47, 53, 53, 55, 60, 63, 62,
+ 48, 46, 53, 56, 58, 64, 64, 64, 48, 46, 53, 56, 59, 65, 66, 65,
+ 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59, 65, 70, 70, 68,
+ 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75, 76, 73,
+ 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82,
+ 60, 53, 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87,
+ 64, 57, 61, 68, 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92,
+ 66, 59, 63, 69, 79, 91, 94, 93, 67, 60, 63, 70, 78, 88, 96, 97,
+ 68, 61, 63, 71, 79, 87, 96, 98, 69, 62, 63, 71, 80, 88, 96, 100,
+ 70, 63, 63, 70, 80, 89, 97, 100, 71, 64, 63, 70, 78, 88, 97, 102,
+ 72, 65, 63, 69, 77, 86, 95, 102, 73, 66, 63, 69, 76, 84, 93, 101}},
+ {{32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62,
+ 78, 86, 31, 32, 34, 41, 50, 61, 76, 85, 31, 32, 34, 41,
+ 49, 59, 74, 82, 31, 33, 35, 42, 49, 59, 73, 81, 32, 33,
+ 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58, 71, 79,
+ 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63,
+ 76, 81, 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54,
+ 60, 68, 80, 87, 39, 39, 48, 58, 65, 73, 86, 88, 41, 40,
+ 49, 60, 67, 76, 88, 93, 44, 42, 51, 63, 71, 79, 92, 94,
+ 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72,
+ 84, 94, 108, 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58,
+ 65, 78, 91, 103, 118, 119, 65, 59, 66, 79, 92, 105, 120, 124,
+ 71, 64, 71, 84, 97, 111, 127, 129, 74, 67, 73, 86, 100, 113,
+ 131, 134, 79, 71, 77, 90, 104, 118, 136, 139, 82, 73, 79, 92,
+ 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139, 150, 87, 78,
+ 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145, 157,
+ 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130,
+ 151, 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97,
+ 110, 126, 144, 163},
+ {32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64,
+ 31, 34, 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61,
+ 33, 37, 46, 45, 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60,
+ 37, 43, 47, 45, 47, 50, 55, 59, 39, 43, 48, 47, 48, 51, 56, 58,
+ 42, 44, 49, 49, 50, 53, 58, 60, 47, 46, 51, 53, 53, 56, 61, 61,
+ 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 56, 57, 60, 64, 64,
+ 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58, 61, 64, 68, 67,
+ 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67, 71, 71,
+ 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80,
+ 57, 51, 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85,
+ 61, 55, 58, 66, 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89,
+ 64, 57, 60, 68, 75, 83, 91, 91, 64, 58, 60, 68, 75, 83, 91, 94,
+ 66, 59, 61, 69, 77, 84, 93, 95, 67, 60, 61, 69, 78, 85, 93, 97,
+ 68, 61, 61, 68, 77, 86, 94, 97, 69, 62, 61, 68, 76, 85, 94, 99,
+ 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67, 74, 82, 90, 98}},
+ {{32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62,
+ 75, 83, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32, 33, 38,
+ 47, 59, 72, 79, 31, 32, 34, 38, 47, 59, 71, 79, 32, 33,
+ 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58, 69, 77,
+ 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62,
+ 72, 78, 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51,
+ 58, 68, 78, 84, 36, 35, 42, 51, 59, 68, 79, 85, 39, 38,
+ 44, 54, 63, 73, 84, 89, 40, 39, 45, 56, 65, 75, 85, 90,
+ 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65,
+ 78, 90, 101, 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53,
+ 57, 69, 83, 97, 109, 113, 58, 54, 57, 70, 84, 98, 110, 118,
+ 65, 59, 62, 74, 89, 105, 118, 122, 66, 60, 63, 75, 90, 106,
+ 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74, 67, 69, 81,
+ 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81, 73,
+ 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147,
+ 86, 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122,
+ 140, 152, 91, 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90,
+ 103, 117, 134, 152},
+ {32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63,
+ 31, 33, 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60,
+ 31, 35, 43, 46, 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59,
+ 35, 39, 46, 46, 47, 50, 55, 58, 37, 41, 47, 46, 46, 50, 54, 57,
+ 41, 43, 48, 49, 49, 52, 57, 59, 42, 43, 48, 49, 50, 53, 57, 60,
+ 49, 47, 50, 53, 54, 57, 60, 62, 49, 47, 50, 53, 54, 57, 61, 63,
+ 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55, 58, 61, 65, 66,
+ 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65, 69, 70,
+ 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78,
+ 54, 50, 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83,
+ 57, 52, 53, 61, 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87,
+ 61, 55, 56, 63, 72, 80, 86, 88, 63, 57, 57, 64, 73, 82, 89, 92,
+ 64, 58, 58, 65, 73, 82, 89, 92, 64, 58, 58, 65, 74, 83, 90, 94,
+ 66, 59, 59, 66, 75, 84, 91, 94, 67, 60, 59, 66, 74, 82, 91, 96,
+ 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65, 71, 79, 87, 95}},
+ {{32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75,
+ 31, 32, 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72,
+ 31, 32, 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71,
+ 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38, 42, 49, 58, 69,
+ 32, 33, 35, 38, 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73,
+ 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60, 68, 78,
+ 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50, 58, 65, 73, 84,
+ 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103,
+ 53, 49, 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110,
+ 58, 54, 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118,
+ 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125,
+ 71, 65, 63, 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133,
+ 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136,
+ 82, 75, 72, 81, 92, 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141},
+ {32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60,
+ 31, 31, 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57,
+ 30, 32, 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56,
+ 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47, 45, 47, 50, 54,
+ 37, 40, 47, 47, 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57,
+ 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54, 57, 60,
+ 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53, 56, 57, 60, 64,
+ 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75,
+ 52, 48, 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78,
+ 54, 50, 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82,
+ 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58, 65, 72, 79, 85,
+ 60, 54, 52, 58, 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89,
+ 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75, 83, 90,
+ 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63, 69, 77, 84, 92}},
+ {{32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70,
+ 31, 32, 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67,
+ 31, 32, 33, 34, 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66,
+ 32, 32, 34, 36, 42, 50, 57, 65, 32, 32, 34, 37, 42, 49, 56, 65,
+ 32, 33, 35, 38, 42, 49, 56, 64, 32, 33, 35, 39, 43, 50, 56, 64,
+ 34, 34, 37, 42, 48, 54, 61, 69, 34, 34, 37, 42, 48, 54, 61, 69,
+ 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48, 54, 60, 66, 74,
+ 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65, 71, 79,
+ 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91,
+ 48, 45, 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98,
+ 53, 49, 50, 60, 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103,
+ 58, 54, 54, 63, 75, 87, 95, 105, 61, 57, 56, 66, 77, 89, 98, 108,
+ 65, 60, 58, 68, 79, 92, 102, 112, 67, 61, 60, 69, 81, 94, 103, 114,
+ 71, 65, 63, 73, 84, 97, 108, 119, 72, 66, 64, 73, 85, 98, 108, 119,
+ 79, 72, 70, 79, 90, 104, 115, 127, 79, 72, 70, 79, 90, 104, 115, 127},
+ {32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58,
+ 31, 31, 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56,
+ 30, 32, 40, 46, 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54,
+ 33, 36, 43, 47, 46, 47, 50, 54, 34, 37, 44, 47, 45, 47, 50, 53,
+ 37, 40, 47, 47, 45, 47, 49, 52, 37, 40, 47, 48, 46, 47, 49, 53,
+ 42, 43, 47, 50, 49, 50, 53, 56, 42, 43, 47, 50, 49, 50, 53, 56,
+ 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53, 53, 54, 56, 59,
+ 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57, 59, 62,
+ 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69,
+ 50, 46, 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72,
+ 52, 48, 47, 54, 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75,
+ 54, 50, 49, 55, 62, 68, 72, 76, 55, 51, 49, 56, 63, 69, 74, 78,
+ 57, 52, 50, 56, 64, 70, 75, 79, 58, 53, 51, 57, 64, 71, 76, 80,
+ 60, 54, 52, 58, 65, 72, 77, 82, 60, 55, 53, 59, 65, 73, 78, 83,
+ 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60, 67, 75, 80, 86}},
+ {{32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63,
+ 31, 32, 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61,
+ 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59,
+ 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36, 38, 42, 50, 59,
+ 32, 32, 34, 36, 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 33, 33, 36, 40, 43, 46, 53, 62,
+ 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43, 46, 50, 56, 65,
+ 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54, 60, 68,
+ 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79,
+ 44, 41, 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84,
+ 48, 45, 46, 54, 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88,
+ 53, 49, 50, 57, 64, 71, 82, 92, 53, 49, 50, 57, 64, 71, 82, 92,
+ 57, 53, 53, 60, 67, 74, 86, 97, 58, 54, 54, 61, 68, 75, 87, 98,
+ 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79, 92, 105,
+ 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62, 70, 76, 83, 96, 109},
+ {32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55,
+ 31, 31, 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53,
+ 30, 32, 40, 44, 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52,
+ 33, 35, 42, 46, 46, 45, 47, 51, 33, 36, 43, 46, 46, 46, 47, 51,
+ 35, 37, 44, 46, 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50,
+ 37, 40, 47, 47, 47, 45, 47, 50, 41, 42, 47, 49, 49, 48, 50, 52,
+ 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50, 51, 51, 52, 54,
+ 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53, 54, 57,
+ 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64,
+ 49, 45, 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66,
+ 50, 46, 46, 52, 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68,
+ 52, 48, 47, 53, 57, 61, 66, 71, 52, 48, 47, 53, 57, 61, 66, 71,
+ 54, 49, 48, 54, 58, 62, 68, 73, 54, 50, 49, 54, 58, 62, 68, 73,
+ 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64, 70, 76,
+ 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57, 61, 65, 72, 78}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52,
+ 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51,
+ 31, 32, 32, 33, 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49,
+ 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 33, 34, 36, 42, 45, 49,
+ 32, 32, 34, 34, 36, 42, 45, 50, 32, 32, 34, 35, 37, 42, 45, 49,
+ 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49,
+ 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38, 42, 48, 50, 54,
+ 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50, 53, 57,
+ 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65,
+ 39, 37, 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69,
+ 44, 41, 42, 45, 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72,
+ 47, 44, 45, 47, 56, 66, 69, 75, 48, 45, 46, 48, 56, 67, 70, 76,
+ 49, 46, 47, 48, 57, 67, 71, 77, 53, 49, 49, 51, 59, 71, 74, 81,
+ 53, 49, 50, 51, 60, 71, 75, 82, 55, 51, 51, 53, 61, 72, 76, 83,
+ 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55, 63, 75, 79, 87},
+ {32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50,
+ 31, 31, 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50,
+ 30, 32, 38, 40, 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48,
+ 31, 33, 38, 41, 46, 45, 46, 48, 33, 35, 41, 43, 47, 45, 46, 47,
+ 33, 36, 41, 44, 47, 46, 46, 47, 34, 37, 42, 45, 47, 45, 46, 47,
+ 37, 40, 45, 47, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47,
+ 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48, 50, 49, 50, 50,
+ 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51, 52, 52,
+ 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57,
+ 48, 46, 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60,
+ 49, 45, 45, 47, 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61,
+ 50, 46, 46, 48, 54, 59, 61, 63, 50, 46, 46, 48, 54, 59, 61, 64,
+ 51, 47, 47, 48, 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66,
+ 52, 48, 47, 48, 54, 61, 63, 66, 53, 48, 48, 49, 54, 61, 63, 67,
+ 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50, 55, 62, 65, 68}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46,
+ 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45,
+ 31, 32, 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44,
+ 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 35, 41, 44,
+ 31, 32, 33, 34, 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 35, 37, 37, 42, 45,
+ 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45,
+ 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56,
+ 36, 34, 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58,
+ 39, 37, 39, 40, 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60,
+ 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 41, 42, 43, 51, 53, 63, 66, 44, 42, 42, 43, 51, 54, 64, 67,
+ 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56, 67, 70,
+ 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48, 56, 58, 69, 73},
+ {32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49,
+ 31, 31, 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48,
+ 31, 32, 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46,
+ 30, 32, 35, 40, 44, 46, 45, 46, 31, 33, 35, 40, 45, 46, 45, 46,
+ 33, 35, 37, 42, 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 35, 38, 41, 45, 47, 47, 45, 46,
+ 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46,
+ 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54,
+ 49, 46, 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55,
+ 48, 46, 46, 47, 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56,
+ 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 52, 53, 58, 60,
+ 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61,
+ 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47, 52, 54, 60, 62}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42,
+ 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 35, 35, 41,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42,
+ 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 37, 37, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48,
+ 34, 34, 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50,
+ 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54,
+ 36, 34, 34, 38, 38, 48, 48, 54, 37, 36, 36, 39, 39, 49, 49, 56,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 41, 39, 39, 42, 42, 52, 52, 60,
+ 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43, 43, 53, 53, 63},
+ {32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47,
+ 31, 31, 31, 38, 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46,
+ 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 30, 32, 32, 40, 40, 46, 46, 45, 32, 34, 34, 41, 41, 46, 46, 45,
+ 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46,
+ 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45, 45, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49,
+ 42, 43, 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51,
+ 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53,
+ 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47, 47, 53, 53, 54,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 45, 45, 46, 46, 53, 53, 57,
+ 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46, 46, 53, 53, 58}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35,
+ 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 32, 34, 35,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 33, 33, 35, 35,
+ 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36,
+ 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40,
+ 33, 33, 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 41, 42,
+ 34, 34, 34, 35, 37, 37, 43, 44, 35, 34, 34, 36, 38, 38, 45, 47,
+ 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48,
+ 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37, 39, 39, 46, 49},
+ {32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47,
+ 31, 31, 32, 34, 39, 39, 45, 46, 30, 31, 32, 34, 39, 39, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 31, 33, 33, 36, 41, 41, 45, 46,
+ 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47,
+ 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48,
+ 41, 42, 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50,
+ 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47, 49, 50,
+ 44, 44, 44, 45, 47, 47, 50, 51, 47, 46, 46, 46, 48, 48, 51, 52,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53,
+ 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 47, 47, 52, 53}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33,
+ 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33, 33, 34,
+ 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35,
+ 32, 32, 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 33, 33, 33, 34, 36, 36, 36, 33, 33, 33, 33, 34, 36, 36, 37,
+ 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34, 35, 37, 37, 38},
+ {32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39,
+ 31, 31, 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40,
+ 31, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41,
+ 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41, 41, 42,
+ 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45,
+ 35, 37, 38, 38, 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 39, 40, 41, 41, 43, 47, 47, 47, 40, 41, 42, 42, 44, 47, 47, 47,
+ 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34},
+ {32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35,
+ 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36,
+ 31, 31, 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37,
+ 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38,
+ 31, 32, 33, 33, 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40,
+ 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36, 36, 36, 39, 42}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32,
+ 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32}}};
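+// 16x32 quantizer matrices: 512 (16*32) coefficients per quantizer level and
+// plane type.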
+constexpr uint8_t kQuantizerMatrix16x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][512] = {
+ {{32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96,
+ 99, 102, 31, 32, 32, 34, 35, 42, 51, 56, 62, 75, 82, 85,
+ 88, 91, 94, 97, 31, 32, 33, 33, 34, 41, 49, 54, 59, 72,
+ 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41, 49, 54,
+ 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42,
+ 50, 54, 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37,
+ 38, 42, 49, 53, 58, 69, 75, 78, 82, 86, 89, 92, 34, 34,
+ 37, 39, 42, 48, 54, 58, 63, 73, 79, 78, 80, 83, 88, 92,
+ 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87,
+ 86, 89, 90, 90, 39, 37, 40, 45, 50, 58, 65, 69, 73, 84,
+ 89, 89, 91, 91, 93, 96, 44, 41, 43, 48, 53, 63, 71, 75,
+ 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49, 55, 65,
+ 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51,
+ 56, 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49,
+ 50, 54, 60, 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111,
+ 58, 54, 54, 58, 63, 75, 87, 92, 98, 110, 116, 115, 112, 111,
+ 115, 112, 61, 57, 56, 60, 66, 77, 89, 95, 101, 114, 120, 118,
+ 119, 118, 116, 120, 65, 60, 58, 63, 68, 79, 92, 98, 105, 118,
+ 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73, 84, 97, 103,
+ 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79, 90,
+ 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75,
+ 72, 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143,
+ 88, 80, 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152,
+ 152, 154, 91, 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155,
+ 162, 160, 159, 155, 94, 85, 83, 82, 91, 100, 105, 118, 131, 137,
+ 153, 160, 165, 167, 166, 168, 97, 88, 86, 85, 94, 100, 107, 123,
+ 128, 140, 157, 161, 167, 173, 171, 169, 100, 91, 89, 87, 97, 100,
+ 111, 121, 127, 145, 152, 164, 173, 178, 182, 181, 103, 94, 93, 90,
+ 98, 101, 114, 120, 131, 144, 150, 170, 174, 180, 186, 183, 107, 97,
+ 96, 93, 100, 104, 117, 119, 136, 142, 155, 168, 177, 187, 191, 198,
+ 110, 101, 100, 97, 101, 108, 117, 123, 138, 141, 161, 165, 183, 188,
+ 193, 200, 114, 104, 104, 100, 103, 112, 117, 127, 137, 146, 159, 167,
+ 185, 190, 201, 206, 118, 108, 107, 103, 105, 115, 118, 131, 136, 151,
+ 157, 172, 182, 197, 203, 208, 122, 111, 111, 107, 107, 119, 119, 136,
+ 136, 156, 156, 178, 179, 203, 204, 217},
+ {32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72,
+ 31, 31, 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68,
+ 30, 32, 40, 42, 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68,
+ 32, 34, 41, 44, 46, 45, 48, 49, 51, 57, 59, 61, 62, 63, 64, 65,
+ 33, 36, 43, 45, 47, 46, 47, 49, 51, 56, 59, 60, 60, 62, 63, 65,
+ 37, 40, 47, 47, 47, 45, 47, 48, 50, 54, 57, 58, 60, 61, 62, 63,
+ 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58, 59, 60, 62, 63,
+ 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60, 61, 61,
+ 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65,
+ 49, 45, 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65,
+ 49, 46, 46, 49, 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68,
+ 50, 46, 46, 50, 54, 59, 64, 65, 67, 71, 73, 72, 72, 70, 70, 69,
+ 52, 48, 47, 50, 54, 61, 66, 68, 71, 75, 77, 74, 73, 73, 71, 72,
+ 54, 50, 49, 52, 55, 62, 68, 71, 73, 78, 80, 78, 76, 74, 75, 73,
+ 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80, 79, 78, 76, 77,
+ 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80, 79, 77,
+ 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81,
+ 64, 58, 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86,
+ 64, 59, 56, 58, 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87,
+ 67, 61, 58, 60, 63, 69, 76, 79, 85, 92, 95, 96, 94, 92, 91, 91,
+ 68, 62, 59, 60, 64, 71, 74, 78, 86, 91, 94, 96, 98, 96, 94, 91,
+ 69, 62, 60, 60, 65, 70, 72, 79, 85, 88, 95, 98, 99, 98, 97, 96,
+ 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97, 99, 101, 98, 97,
+ 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102, 102, 102, 101,
+ 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101, 103, 104, 102,
+ 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105, 106, 107,
+ 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106, 107,
+ 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110,
+ 77, 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113}},
+ {{32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93,
+ 96, 99, 31, 32, 32, 33, 35, 42, 45, 51, 62, 69, 75, 83,
+ 86, 88, 91, 94, 31, 32, 32, 33, 35, 41, 44, 49, 60, 67,
+ 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41, 44, 49,
+ 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42,
+ 45, 50, 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36,
+ 38, 42, 45, 49, 58, 64, 69, 76, 80, 83, 86, 88, 32, 33,
+ 35, 36, 40, 44, 47, 51, 60, 66, 71, 76, 78, 81, 85, 89,
+ 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84,
+ 83, 86, 87, 87, 38, 36, 39, 41, 49, 56, 58, 63, 71, 77,
+ 81, 86, 88, 88, 90, 93, 39, 37, 40, 42, 50, 58, 60, 65,
+ 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45, 53, 63,
+ 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47,
+ 56, 66, 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46,
+ 47, 48, 57, 67, 71, 77, 86, 93, 97, 103, 103, 105, 102, 106,
+ 53, 49, 50, 51, 60, 71, 75, 82, 92, 99, 103, 111, 108, 107,
+ 110, 107, 58, 54, 54, 55, 63, 75, 79, 87, 98, 105, 110, 114,
+ 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89, 100, 107,
+ 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84,
+ 89, 97, 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69,
+ 76, 88, 92, 101, 115, 123, 130, 134, 134, 131, 132, 135, 79, 72,
+ 70, 71, 79, 90, 95, 104, 118, 127, 133, 143, 142, 141, 138, 136,
+ 82, 75, 73, 74, 81, 92, 97, 106, 121, 130, 136, 146, 145, 144,
+ 144, 145, 86, 78, 76, 77, 84, 95, 100, 109, 124, 133, 140, 147,
+ 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99, 112, 124, 130,
+ 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95, 101, 116,
+ 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92, 95,
+ 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92,
+ 91, 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185,
+ 104, 95, 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175,
+ 181, 186, 107, 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156,
+ 173, 177, 188, 192, 110, 101, 100, 97, 98, 108, 111, 123, 127, 141,
+ 147, 161, 169, 183, 188, 193, 114, 104, 104, 100, 100, 111, 111, 126,
+ 127, 145, 145, 166, 166, 189, 190, 201},
+ {32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71,
+ 31, 31, 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67,
+ 30, 32, 38, 40, 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67,
+ 31, 33, 38, 41, 46, 45, 46, 48, 52, 55, 57, 60, 61, 62, 63, 64,
+ 33, 36, 41, 44, 47, 46, 46, 47, 51, 54, 56, 59, 60, 61, 63, 64,
+ 37, 40, 45, 47, 47, 45, 46, 47, 50, 52, 54, 57, 59, 61, 62, 62,
+ 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57, 58, 59, 61, 62,
+ 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59, 60, 60,
+ 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64,
+ 48, 46, 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64,
+ 49, 45, 45, 47, 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67,
+ 50, 46, 46, 48, 54, 59, 61, 63, 66, 68, 70, 71, 71, 68, 69, 67,
+ 51, 47, 47, 48, 54, 60, 61, 64, 68, 70, 71, 73, 72, 72, 70, 71,
+ 52, 48, 47, 48, 54, 61, 63, 66, 71, 73, 75, 77, 75, 73, 74, 71,
+ 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79, 78, 76, 74, 75,
+ 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78, 78, 75,
+ 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80,
+ 62, 56, 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84,
+ 63, 57, 55, 56, 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84,
+ 64, 59, 56, 57, 61, 68, 71, 75, 83, 87, 90, 93, 92, 90, 89, 89,
+ 66, 60, 58, 58, 62, 69, 72, 76, 84, 88, 91, 94, 95, 93, 91, 89,
+ 67, 61, 59, 58, 63, 68, 71, 78, 83, 86, 93, 96, 96, 96, 94, 94,
+ 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95, 97, 98, 96, 94,
+ 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99, 100, 98,
+ 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101, 99,
+ 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104,
+ 73, 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106,
+ 74, 67, 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106,
+ 75, 68, 68, 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109}},
+ {{32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90,
+ 93, 96, 31, 32, 32, 32, 35, 38, 42, 51, 55, 62, 75, 77,
+ 83, 86, 88, 91, 31, 32, 32, 32, 35, 38, 41, 50, 54, 60,
+ 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37, 41, 49,
+ 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39,
+ 42, 50, 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34,
+ 37, 40, 42, 49, 53, 58, 70, 71, 77, 80, 83, 85, 32, 33,
+ 34, 35, 38, 40, 42, 49, 52, 58, 69, 70, 76, 78, 82, 86,
+ 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77,
+ 81, 83, 84, 84, 36, 34, 37, 38, 48, 51, 54, 60, 63, 68,
+ 78, 80, 85, 85, 86, 89, 39, 37, 39, 40, 50, 54, 58, 65,
+ 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41, 51, 55,
+ 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43,
+ 53, 58, 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44,
+ 45, 46, 56, 61, 66, 75, 79, 85, 95, 97, 99, 101, 98, 102,
+ 49, 46, 46, 47, 57, 62, 67, 77, 81, 86, 97, 99, 104, 102,
+ 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92, 103, 105,
+ 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87,
+ 91, 98, 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73,
+ 79, 92, 97, 105, 118, 119, 123, 123, 122, 119, 69, 63, 62, 62,
+ 71, 76, 83, 96, 100, 109, 122, 124, 127, 125, 125, 128, 71, 65,
+ 64, 63, 73, 78, 84, 97, 102, 111, 125, 127, 135, 134, 131, 129,
+ 79, 72, 71, 70, 79, 84, 90, 104, 109, 118, 133, 135, 137, 136,
+ 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120, 135, 137,
+ 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110,
+ 114, 125, 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89,
+ 99, 108, 113, 129, 135, 146, 153, 157, 160, 159, 92, 84, 83, 81,
+ 88, 90, 102, 106, 117, 128, 133, 150, 153, 158, 163, 160, 95, 87,
+ 85, 83, 88, 92, 103, 105, 120, 125, 137, 148, 155, 164, 168, 173,
+ 98, 89, 88, 85, 89, 95, 103, 108, 121, 124, 141, 144, 160, 164,
+ 169, 174, 100, 92, 91, 88, 90, 98, 103, 111, 120, 127, 139, 146,
+ 161, 165, 175, 179, 103, 94, 94, 90, 92, 101, 103, 114, 119, 131,
+ 137, 150, 158, 170, 175, 180, 106, 97, 97, 93, 93, 104, 104, 118,
+ 118, 135, 135, 154, 155, 175, 176, 187},
+ {32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69,
+ 31, 31, 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66,
+ 31, 32, 36, 39, 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66,
+ 30, 32, 36, 40, 46, 45, 45, 48, 49, 52, 57, 58, 60, 61, 62, 63,
+ 33, 36, 40, 43, 47, 46, 46, 47, 49, 51, 56, 57, 59, 60, 62, 63,
+ 35, 38, 42, 45, 47, 46, 45, 47, 48, 50, 55, 56, 58, 60, 61, 61,
+ 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55, 57, 58, 60, 61,
+ 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58, 59, 59,
+ 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63,
+ 48, 46, 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63,
+ 48, 45, 46, 46, 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66,
+ 49, 45, 45, 46, 53, 56, 58, 61, 62, 64, 67, 68, 70, 67, 68, 66,
+ 50, 46, 46, 46, 54, 56, 59, 63, 65, 66, 70, 71, 70, 71, 68, 70,
+ 51, 47, 47, 47, 54, 57, 60, 64, 65, 68, 71, 72, 73, 71, 72, 70,
+ 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75, 76, 75, 73, 73,
+ 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77, 76, 74,
+ 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78,
+ 59, 54, 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82,
+ 60, 54, 53, 52, 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82,
+ 63, 57, 56, 55, 60, 64, 67, 75, 77, 82, 89, 90, 90, 88, 87, 86,
+ 64, 58, 57, 55, 61, 64, 68, 75, 78, 82, 89, 90, 93, 91, 89, 87,
+ 64, 59, 57, 56, 61, 65, 68, 75, 78, 83, 90, 91, 94, 93, 92, 91,
+ 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93, 94, 95, 93, 91,
+ 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97, 97, 95,
+ 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101,
+ 70, 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101,
+ 71, 65, 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103,
+ 72, 65, 65, 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103,
+ 73, 66, 66, 63, 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105}},
+ {{32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88,
+ 90, 93, 31, 32, 32, 32, 35, 35, 42, 45, 51, 59, 62, 75,
+ 78, 83, 86, 88, 31, 32, 32, 32, 34, 35, 41, 45, 50, 58,
+ 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34, 41, 44,
+ 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36,
+ 42, 44, 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34,
+ 36, 36, 42, 45, 50, 57, 59, 71, 73, 78, 80, 82, 32, 33,
+ 34, 35, 37, 38, 42, 45, 49, 56, 58, 69, 71, 76, 79, 83,
+ 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73,
+ 76, 81, 81, 80, 35, 34, 36, 38, 45, 47, 52, 55, 59, 65,
+ 67, 77, 79, 82, 83, 86, 36, 34, 36, 38, 46, 48, 54, 56,
+ 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40, 48, 50,
+ 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41,
+ 49, 51, 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41,
+ 42, 43, 51, 53, 63, 66, 71, 78, 79, 90, 92, 97, 94, 97,
+ 47, 44, 44, 45, 53, 56, 66, 69, 75, 82, 84, 95, 97, 98,
+ 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90,
+ 92, 103, 106, 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77,
+ 84, 92, 94, 106, 108, 111, 110, 112, 58, 54, 54, 54, 61, 63,
+ 75, 79, 87, 95, 98, 110, 112, 117, 116, 113, 63, 58, 58, 57,
+ 65, 67, 78, 83, 91, 100, 103, 116, 118, 119, 119, 121, 65, 60,
+ 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120, 127, 124, 122,
+ 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127, 129,
+ 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128,
+ 131, 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115,
+ 118, 133, 136, 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97,
+ 105, 117, 120, 136, 139, 145, 142, 140, 82, 75, 74, 72, 79, 81,
+ 92, 97, 106, 117, 121, 136, 139, 148, 150, 149, 87, 79, 78, 76,
+ 83, 85, 96, 100, 110, 120, 125, 141, 144, 148, 153, 150, 89, 82,
+ 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145, 153, 157, 161,
+ 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150, 153,
+ 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136,
+ 151, 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123,
+ 128, 140, 147, 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110,
+ 110, 126, 126, 144, 144, 163, 163, 173},
+ {32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31,
+ 31, 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32,
+ 34, 39, 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35,
+ 40, 44, 46, 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42,
+ 46, 47, 45, 46, 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46,
+ 47, 46, 46, 47, 50, 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47,
+ 45, 46, 47, 49, 50, 54, 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47,
+ 47, 48, 50, 51, 55, 56, 57, 58, 59, 42, 43, 44, 47, 49, 50, 49, 50,
+ 50, 53, 53, 57, 58, 60, 60, 59, 47, 46, 46, 48, 51, 52, 53, 53, 53,
+ 55, 56, 60, 61, 61, 61, 62, 49, 46, 47, 48, 52, 53, 53, 54, 54, 56,
+ 57, 60, 61, 63, 63, 62, 48, 46, 46, 47, 51, 53, 56, 56, 57, 59, 60,
+ 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53, 57, 57, 59, 61, 61, 65,
+ 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59, 61, 63, 64, 67, 68,
+ 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65, 66, 70, 71, 70,
+ 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71, 71, 73, 71,
+ 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75, 74, 72,
+ 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76, 54,
+ 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51,
+ 50, 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52,
+ 57, 58, 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58,
+ 59, 66, 69, 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60,
+ 67, 70, 75, 80, 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68,
+ 71, 75, 81, 83, 90, 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71,
+ 75, 81, 83, 90, 91, 94, 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77,
+ 82, 84, 92, 93, 94, 95, 93, 67, 61, 60, 58, 61, 63, 69, 70, 78, 80,
+ 85, 90, 93, 96, 97, 97, 68, 62, 61, 59, 61, 64, 68, 71, 77, 79, 86,
+ 88, 94, 96, 97, 98, 69, 63, 62, 59, 61, 65, 68, 72, 76, 80, 85, 88,
+ 94, 95, 99, 99, 70, 63, 63, 60, 61, 66, 67, 73, 75, 81, 83, 89, 92,
+ 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74, 74, 82, 82, 90, 90, 98,
+ 98, 102}},
+ {{32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81,
+ 87, 90, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51, 62, 63,
+ 75, 77, 83, 85, 31, 32, 32, 32, 33, 35, 39, 42, 49, 51,
+ 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34, 38, 41,
+ 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35,
+ 38, 41, 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34,
+ 35, 36, 39, 42, 48, 50, 59, 60, 71, 72, 78, 80, 32, 32,
+ 33, 35, 36, 37, 40, 42, 48, 49, 58, 59, 69, 71, 77, 80,
+ 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63,
+ 72, 74, 78, 78, 34, 34, 34, 37, 39, 42, 45, 48, 53, 54,
+ 63, 64, 73, 75, 80, 83, 36, 34, 35, 38, 42, 48, 51, 54,
+ 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38, 42, 48,
+ 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40,
+ 44, 50, 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38,
+ 39, 41, 45, 51, 56, 59, 65, 67, 75, 76, 85, 87, 90, 93,
+ 44, 41, 41, 43, 46, 53, 59, 63, 69, 71, 79, 80, 90, 91,
+ 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76,
+ 85, 86, 96, 98, 103, 100, 52, 48, 48, 49, 52, 59, 65, 70,
+ 78, 80, 90, 91, 101, 103, 105, 107, 53, 49, 49, 50, 53, 60,
+ 66, 71, 79, 82, 92, 93, 103, 105, 111, 107, 58, 53, 53, 53,
+ 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58, 54,
+ 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115,
+ 65, 60, 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119,
+ 122, 123, 66, 61, 60, 59, 63, 69, 75, 80, 90, 93, 106, 107,
+ 119, 121, 126, 123, 71, 65, 65, 63, 67, 73, 79, 84, 94, 97,
+ 111, 112, 125, 127, 131, 132, 74, 68, 67, 66, 69, 75, 81, 86,
+ 97, 100, 113, 115, 128, 130, 134, 132, 79, 72, 72, 70, 73, 79,
+ 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74, 73, 71,
+ 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151,
+ 86, 78, 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142,
+ 147, 151, 88, 81, 80, 77, 80, 86, 90, 98, 105, 112, 122, 127,
+ 140, 144, 152, 155, 91, 83, 82, 79, 80, 88, 90, 100, 103, 114,
+ 119, 130, 137, 148, 151, 155, 93, 85, 85, 81, 81, 90, 90, 102,
+ 103, 117, 117, 134, 134, 151, 152, 160},
+ {32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31,
+ 31, 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33,
+ 40, 42, 46, 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41,
+ 43, 46, 46, 45, 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44,
+ 47, 46, 46, 47, 47, 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47,
+ 46, 45, 47, 47, 50, 51, 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46,
+ 45, 46, 47, 50, 50, 54, 55, 57, 58, 41, 42, 43, 47, 48, 49, 49, 48,
+ 49, 50, 52, 53, 57, 57, 59, 58, 42, 43, 43, 47, 48, 50, 49, 49, 50,
+ 50, 53, 54, 57, 58, 60, 61, 49, 46, 47, 48, 50, 53, 53, 53, 54, 54,
+ 57, 57, 60, 61, 62, 61, 49, 46, 47, 48, 50, 53, 53, 54, 54, 55, 57,
+ 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53, 54, 56, 57, 57, 60, 60,
+ 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56, 58, 58, 61, 61, 65,
+ 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61, 64, 64, 67, 68,
+ 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66, 69, 69, 70,
+ 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71, 73, 71,
+ 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74, 52,
+ 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50,
+ 49, 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50,
+ 52, 56, 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53,
+ 57, 61, 64, 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58,
+ 62, 65, 71, 72, 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63,
+ 66, 72, 73, 80, 81, 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67,
+ 73, 75, 82, 82, 89, 90, 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73,
+ 75, 82, 83, 89, 90, 92, 90, 64, 59, 58, 56, 58, 61, 65, 68, 74, 75,
+ 83, 83, 90, 91, 94, 95, 66, 60, 59, 57, 59, 62, 66, 69, 75, 76, 84,
+ 85, 91, 92, 94, 95, 67, 61, 60, 58, 59, 63, 66, 70, 74, 77, 82, 85,
+ 91, 93, 96, 96, 68, 62, 61, 58, 59, 64, 65, 71, 72, 78, 81, 86, 89,
+ 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71, 71, 79, 79, 87, 87, 95,
+ 95, 98}},
+ {{32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79,
+ 79, 87, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51, 51, 62,
+ 62, 75, 75, 82, 31, 32, 32, 32, 32, 35, 35, 42, 42, 51,
+ 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34,
+ 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77, 32, 32,
+ 32, 34, 34, 36, 36, 42, 42, 50, 50, 59, 59, 71, 71, 77,
+ 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58,
+ 58, 69, 69, 75, 34, 34, 34, 37, 37, 42, 42, 48, 48, 54,
+ 54, 63, 63, 73, 73, 79, 34, 34, 34, 37, 37, 42, 42, 48,
+ 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37,
+ 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84, 84, 89,
+ 39, 37, 37, 40, 40, 50, 50, 58, 58, 65, 65, 73, 73, 84,
+ 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71,
+ 71, 79, 79, 90, 90, 95, 48, 45, 45, 46, 46, 56, 56, 67,
+ 67, 76, 76, 85, 85, 96, 96, 102, 48, 45, 45, 46, 46, 56,
+ 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49, 49, 50,
+ 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109,
+ 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98, 98, 110,
+ 110, 116, 58, 54, 54, 54, 54, 63, 63, 75, 75, 87, 87, 98,
+ 98, 110, 110, 116, 65, 60, 60, 58, 58, 68, 68, 79, 79, 92,
+ 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68, 79,
+ 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73,
+ 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63,
+ 63, 73, 73, 84, 84, 97, 97, 111, 111, 125, 125, 132, 79, 72,
+ 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133, 133, 141,
+ 79, 72, 72, 70, 70, 79, 79, 90, 90, 104, 104, 118, 118, 133,
+ 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106, 106, 121,
+ 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92, 106,
+ 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96,
+ 96, 109, 109, 124, 124, 141, 141, 149},
+ {32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31,
+ 31, 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32,
+ 40, 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40,
+ 40, 46, 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43,
+ 47, 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47,
+ 47, 46, 46, 47, 47, 51, 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47,
+ 45, 45, 47, 47, 50, 50, 54, 54, 57, 37, 40, 40, 47, 47, 47, 47, 45,
+ 45, 47, 47, 50, 50, 54, 54, 57, 42, 43, 43, 47, 47, 50, 50, 49, 49,
+ 50, 50, 53, 53, 57, 57, 60, 42, 43, 43, 47, 47, 50, 50, 49, 49, 50,
+ 50, 53, 53, 57, 57, 60, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54,
+ 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53, 53, 53, 53, 54, 54, 57,
+ 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60,
+ 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57, 57, 60, 60, 64,
+ 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67,
+ 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67, 67, 69,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 50,
+ 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48,
+ 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49,
+ 49, 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49,
+ 55, 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56,
+ 56, 64, 64, 70, 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56,
+ 64, 64, 70, 70, 76, 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65,
+ 65, 72, 72, 79, 79, 85, 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65,
+ 72, 72, 79, 79, 85, 85, 88, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75,
+ 75, 82, 82, 89, 89, 92, 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75,
+ 82, 82, 89, 89, 92, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83,
+ 83, 90, 90, 93, 64, 59, 59, 56, 56, 61, 61, 68, 68, 75, 75, 83, 83,
+ 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69, 69, 77, 77, 84, 84, 92,
+ 92, 95}},
+ {{32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75,
+ 31, 32, 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75,
+ 31, 32, 32, 32, 32, 33, 35, 36, 41, 42, 50, 50, 58, 60, 67, 73,
+ 31, 32, 32, 32, 33, 33, 34, 36, 41, 41, 49, 49, 57, 59, 66, 72,
+ 31, 32, 32, 33, 33, 34, 35, 37, 41, 42, 49, 49, 57, 59, 66, 71,
+ 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50, 57, 59, 65, 71,
+ 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59, 65, 70,
+ 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 34, 34, 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73,
+ 35, 34, 34, 37, 38, 42, 47, 48, 52, 53, 59, 59, 65, 67, 73, 77,
+ 36, 35, 34, 37, 38, 43, 48, 49, 54, 54, 60, 60, 66, 68, 74, 78,
+ 38, 36, 36, 38, 40, 44, 49, 51, 56, 57, 63, 63, 69, 71, 77, 81,
+ 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65, 71, 73, 79, 84,
+ 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76, 81, 86,
+ 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 48, 45, 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96,
+ 53, 49, 49, 49, 49, 54, 59, 62, 71, 71, 81, 81, 89, 91, 98, 103,
+ 53, 50, 49, 50, 50, 54, 60, 63, 71, 72, 82, 82, 90, 92, 99, 103,
+ 57, 53, 52, 52, 52, 57, 62, 65, 74, 75, 85, 85, 94, 96, 103, 108,
+ 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87, 87, 95, 98, 105, 110,
+ 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89, 98, 101, 108, 114,
+ 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102, 105, 112, 118,
+ 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106, 114, 120,
+ 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119, 125,
+ 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133},
+ {32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31,
+ 31, 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32,
+ 37, 39, 42, 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37,
+ 40, 42, 46, 46, 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41,
+ 44, 46, 46, 45, 45, 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45,
+ 47, 46, 46, 46, 47, 47, 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47,
+ 47, 45, 46, 47, 47, 50, 51, 53, 55, 37, 40, 40, 45, 47, 47, 47, 47,
+ 45, 46, 47, 47, 49, 50, 52, 54, 37, 40, 40, 45, 47, 47, 48, 47, 46,
+ 46, 47, 47, 49, 50, 53, 55, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49,
+ 50, 50, 53, 53, 56, 57, 42, 43, 43, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50, 52, 52, 53, 53, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 54, 56,
+ 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55, 56, 56, 58, 58,
+ 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57, 59, 60, 62,
+ 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61, 63, 65,
+ 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67, 49,
+ 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46,
+ 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47,
+ 47, 50, 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47,
+ 50, 54, 56, 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51,
+ 55, 57, 62, 62, 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55,
+ 57, 62, 63, 68, 68, 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58,
+ 63, 63, 69, 69, 74, 75, 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64,
+ 64, 70, 70, 75, 76, 79, 82, 58, 53, 53, 51, 51, 54, 57, 59, 64, 65,
+ 71, 71, 76, 77, 80, 83, 60, 55, 54, 53, 52, 55, 58, 60, 65, 66, 72,
+ 72, 77, 79, 82, 85, 60, 55, 55, 53, 53, 55, 59, 60, 65, 66, 73, 73,
+ 78, 79, 83, 85, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80,
+ 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62, 67, 68, 75, 75, 80, 82,
+ 86, 89}},
+ {{32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63,
+ 31, 32, 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62,
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 41, 41, 48, 50, 54, 61, 61,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 32, 33, 33, 34, 34, 37, 41, 41, 47, 49, 53, 59, 59,
+ 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48, 49, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53, 59, 59,
+ 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58,
+ 33, 33, 33, 35, 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62,
+ 34, 34, 34, 35, 37, 37, 41, 42, 44, 48, 48, 53, 54, 57, 63, 63,
+ 34, 34, 34, 35, 37, 37, 43, 44, 46, 50, 50, 55, 56, 59, 65, 65,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58, 60, 63, 68, 68,
+ 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67, 72, 72,
+ 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 44, 41, 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79,
+ 47, 44, 44, 44, 45, 45, 53, 56, 59, 66, 66, 73, 75, 78, 84, 84,
+ 48, 45, 45, 45, 46, 46, 54, 56, 60, 67, 67, 74, 76, 79, 85, 85,
+ 50, 47, 46, 47, 47, 47, 55, 58, 61, 68, 68, 76, 78, 82, 88, 88,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86, 92, 92,
+ 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98,
+ 61, 57, 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 65, 61, 60, 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105,
+ 70, 65, 64, 63, 62, 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109},
+ {32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31,
+ 31, 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32,
+ 34, 39, 39, 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35,
+ 40, 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40,
+ 40, 44, 46, 45, 45, 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42,
+ 46, 47, 46, 45, 45, 47, 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46,
+ 47, 46, 46, 46, 47, 47, 49, 51, 51, 35, 37, 37, 40, 44, 44, 46, 47,
+ 46, 45, 45, 47, 47, 48, 51, 51, 37, 39, 40, 43, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 48, 50, 50, 37, 39, 40, 43, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 48, 50, 50, 41, 42, 42, 44, 47, 47, 49, 49, 49, 48, 48,
+ 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47, 49, 50, 50, 49, 49, 50,
+ 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51, 51, 51, 51, 52, 52,
+ 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55,
+ 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54, 54, 55, 57,
+ 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58, 59, 59,
+ 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60, 48,
+ 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45,
+ 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46,
+ 46, 46, 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46,
+ 46, 52, 54, 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47,
+ 52, 54, 56, 60, 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53,
+ 54, 57, 61, 61, 65, 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54,
+ 57, 61, 61, 65, 66, 68, 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58,
+ 62, 62, 67, 68, 70, 73, 73, 54, 51, 50, 49, 49, 49, 54, 55, 58, 62,
+ 62, 67, 68, 70, 73, 73, 55, 51, 51, 50, 49, 49, 54, 56, 58, 63, 63,
+ 68, 69, 71, 74, 74, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69,
+ 70, 73, 76, 76, 57, 53, 52, 51, 50, 50, 55, 56, 59, 64, 64, 69, 70,
+ 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58, 61, 65, 65, 70, 72, 74,
+ 78, 78}},
+ {{32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32,
+ 32, 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32,
+ 33, 33, 34, 34, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33,
+ 33, 35, 35, 36, 41, 41, 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34,
+ 36, 36, 38, 42, 42, 45, 49, 49, 54, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 38, 42, 42, 45, 50, 50, 54, 32, 32, 32, 33, 34, 34, 35, 37, 37,
+ 38, 42, 42, 45, 49, 49, 54, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39,
+ 42, 42, 45, 49, 49, 53, 32, 32, 33, 33, 35, 35, 36, 38, 38, 39, 42,
+ 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36, 36, 39, 40, 41, 44, 44,
+ 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50,
+ 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44, 48, 48, 50, 54,
+ 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50, 53, 57, 57,
+ 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64, 38,
+ 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37,
+ 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40,
+ 42, 42, 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42,
+ 43, 45, 52, 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43,
+ 45, 52, 54, 56, 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47,
+ 54, 56, 58, 66, 66, 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55,
+ 56, 59, 67, 67, 70, 76, 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57,
+ 60, 67, 67, 71, 77, 77, 81, 53, 50, 49, 49, 49, 49, 51, 58, 59, 62,
+ 71, 71, 74, 81, 81, 86, 53, 51, 49, 49, 50, 50, 51, 59, 60, 63, 71,
+ 71, 75, 82, 82, 87, 55, 52, 51, 51, 51, 51, 53, 60, 61, 64, 72, 72,
+ 76, 83, 83, 88, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79,
+ 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62, 63, 67, 75, 75, 79, 87,
+ 87, 92},
+ {32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31,
+ 31, 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31,
+ 31, 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31,
+ 32, 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32,
+ 38, 39, 40, 45, 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38,
+ 40, 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40,
+ 41, 45, 46, 46, 45, 45, 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43,
+ 46, 47, 46, 45, 45, 46, 47, 47, 49, 33, 35, 36, 36, 41, 43, 44, 46,
+ 47, 46, 46, 46, 46, 47, 47, 49, 34, 36, 37, 37, 42, 44, 45, 47, 47,
+ 47, 45, 45, 46, 47, 47, 49, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47,
+ 45, 45, 46, 47, 47, 48, 37, 39, 40, 41, 45, 47, 47, 47, 47, 47, 45,
+ 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47, 47, 48, 48, 48, 47, 47,
+ 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50,
+ 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50, 49, 49, 50, 50,
+ 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51, 52, 52, 52,
+ 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55, 48,
+ 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46,
+ 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45,
+ 46, 46, 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45,
+ 46, 47, 52, 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 55, 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48,
+ 53, 54, 55, 59, 59, 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53,
+ 54, 55, 59, 59, 61, 64, 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54,
+ 55, 60, 60, 61, 64, 64, 66, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56,
+ 61, 61, 63, 66, 66, 68, 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61,
+ 61, 63, 66, 66, 68, 53, 50, 48, 48, 48, 48, 49, 54, 54, 56, 61, 61,
+ 63, 67, 67, 69, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65,
+ 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55, 55, 57, 62, 62, 65, 68,
+ 68, 71}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31,
+ 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32,
+ 32, 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33,
+ 33, 34, 35, 35, 38, 41, 41, 44, 49, 31, 32, 32, 32, 33, 34, 34, 34,
+ 35, 36, 36, 39, 42, 42, 44, 49, 32, 32, 32, 32, 33, 34, 34, 34, 36,
+ 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 34, 34, 34, 36, 36,
+ 36, 39, 42, 42, 45, 50, 32, 32, 32, 32, 33, 35, 35, 35, 37, 37, 37,
+ 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41,
+ 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38, 41, 42,
+ 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40, 40, 42, 44, 44,
+ 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50,
+ 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48, 50, 54,
+ 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55, 35,
+ 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34,
+ 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36,
+ 37, 40, 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39,
+ 40, 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40,
+ 40, 42, 48, 50, 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41,
+ 43, 49, 51, 51, 56, 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45,
+ 51, 53, 53, 59, 63, 63, 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51,
+ 53, 53, 59, 63, 63, 66, 71, 44, 43, 42, 42, 42, 43, 43, 45, 51, 54,
+ 54, 59, 64, 64, 67, 72, 47, 45, 44, 44, 44, 45, 45, 47, 53, 56, 56,
+ 61, 66, 66, 69, 75, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62,
+ 67, 67, 70, 76, 48, 46, 45, 45, 45, 46, 46, 48, 54, 56, 56, 62, 67,
+ 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50, 56, 58, 58, 64, 69, 69,
+ 73, 79},
+ {32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31,
+ 31, 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31,
+ 31, 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31,
+ 31, 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32,
+ 34, 39, 39, 40, 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35,
+ 40, 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40,
+ 40, 41, 44, 46, 46, 45, 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40,
+ 41, 45, 46, 46, 45, 45, 45, 46, 48, 33, 34, 35, 35, 37, 42, 42, 43,
+ 46, 47, 47, 46, 45, 45, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46,
+ 47, 47, 46, 46, 46, 46, 47, 33, 35, 36, 36, 38, 43, 43, 44, 46, 47,
+ 47, 46, 46, 46, 46, 47, 35, 37, 38, 38, 41, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50,
+ 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49, 50, 50,
+ 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51, 47,
+ 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46,
+ 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46,
+ 46, 47, 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46,
+ 47, 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47,
+ 47, 48, 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46,
+ 47, 51, 53, 53, 55, 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47,
+ 51, 53, 53, 56, 58, 58, 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51,
+ 53, 53, 56, 58, 58, 59, 61, 49, 47, 45, 45, 45, 46, 46, 47, 52, 53,
+ 53, 56, 58, 58, 60, 62, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54,
+ 57, 59, 59, 61, 63, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57,
+ 59, 59, 61, 64, 50, 48, 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59,
+ 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48, 52, 54, 54, 58, 60, 60,
+ 62, 65}},
+ {{32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 34, 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 37, 41, 41, 31, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 35, 35, 35, 38, 41, 41, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35,
+ 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36,
+ 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36,
+ 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34, 34, 36, 37, 37, 37,
+ 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42,
+ 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42,
+ 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45, 34,
+ 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34,
+ 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34,
+ 34, 36, 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34,
+ 36, 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36,
+ 38, 38, 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38,
+ 38, 38, 43, 48, 48, 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39,
+ 39, 44, 49, 49, 49, 52, 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40,
+ 45, 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45,
+ 50, 50, 50, 54, 58, 58, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50,
+ 50, 50, 54, 58, 58, 41, 40, 39, 39, 39, 40, 42, 42, 42, 46, 52, 52,
+ 52, 56, 60, 60, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53,
+ 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43, 43, 48, 53, 53, 53, 58,
+ 63, 63},
+ {32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31,
+ 31, 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31,
+ 31, 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31,
+ 31, 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32,
+ 35, 39, 39, 39, 42, 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35,
+ 40, 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40,
+ 40, 40, 42, 46, 46, 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40,
+ 40, 42, 46, 46, 46, 45, 45, 45, 32, 33, 34, 34, 34, 37, 41, 41, 41,
+ 44, 46, 46, 46, 46, 45, 45, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45,
+ 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47,
+ 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39, 43, 43, 43, 45, 47, 47,
+ 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45, 45, 46, 47, 47, 47,
+ 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45,
+ 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46, 45, 45,
+ 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47, 42,
+ 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43,
+ 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44,
+ 44, 46, 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46,
+ 47, 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47,
+ 48, 48, 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48,
+ 48, 48, 50, 53, 53, 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47,
+ 47, 50, 53, 53, 53, 54, 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50,
+ 53, 53, 53, 54, 56, 56, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53,
+ 53, 53, 54, 56, 56, 48, 47, 45, 45, 45, 46, 46, 46, 46, 49, 53, 53,
+ 53, 55, 57, 57, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53,
+ 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46, 46, 49, 53, 53, 53, 56,
+ 58, 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33,
+ 33, 33, 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33,
+ 33, 35, 36, 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37,
+ 37, 37, 40, 43, 44, 44, 44, 35, 35, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38,
+ 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42,
+ 46, 48, 48, 48, 36, 35, 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46,
+ 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38, 39, 39, 39, 42, 46, 49,
+ 49, 49},
+ {32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31,
+ 31, 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31,
+ 31, 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31,
+ 31, 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31,
+ 31, 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32,
+ 34, 37, 39, 39, 39, 41, 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34,
+ 38, 39, 39, 39, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38,
+ 40, 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40,
+ 40, 40, 42, 44, 46, 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40,
+ 40, 42, 44, 46, 46, 46, 31, 32, 33, 33, 33, 33, 36, 39, 41, 41, 41,
+ 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35, 37, 40, 42, 42, 42, 44,
+ 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46,
+ 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47,
+ 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44, 46, 47, 47,
+ 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47, 47, 47,
+ 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47, 37,
+ 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39,
+ 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41,
+ 41, 41, 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42,
+ 42, 44, 46, 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44,
+ 46, 47, 47, 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 48, 49, 50, 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 49, 50, 51, 51, 51, 47, 46, 46, 46, 46, 46, 46, 47, 48, 48,
+ 48, 49, 51, 52, 52, 52, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48,
+ 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50,
+ 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52,
+ 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 52, 53,
+ 53, 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34,
+ 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34,
+ 34, 35, 35, 35, 35, 36, 37, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36,
+ 36, 36, 36, 36, 38, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 37,
+ 38, 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31,
+ 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31,
+ 31, 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31,
+ 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 32, 32, 32, 32,
+ 34, 36, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 32, 34,
+ 37, 39, 39, 39, 39, 40, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37,
+ 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40,
+ 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40,
+ 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40,
+ 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38, 40, 40, 40, 40,
+ 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41, 41, 41, 42,
+ 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42, 43, 44,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33,
+ 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35,
+ 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37,
+ 37, 37, 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38,
+ 38, 38, 39, 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39,
+ 39, 40, 42, 44, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40,
+ 41, 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41,
+ 43, 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43,
+ 45, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45,
+ 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41, 41, 42, 43, 45, 47,
+ 47, 47, 47, 47, 48, 40, 41, 41, 42, 42, 42, 42, 42, 44, 45, 47, 47,
+ 47, 47, 47, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47,
+ 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 48, 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 36,
+ 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38,
+ 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36,
+ 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36,
+ 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37,
+ 38, 40, 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38,
+ 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 36, 38, 39, 40,
+ 42, 44}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32}}};
+constexpr uint8_t
+ kQuantizerMatrix4x4[kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes]
+ [10] = {{{32, 43, 67, 73, 94, 137, 97, 110, 150, 200},
+ {35, 46, 60, 57, 69, 90, 66, 71, 90, 109}},
+ {{32, 41, 63, 69, 88, 127, 92, 103, 140, 184},
+ {33, 45, 58, 56, 66, 86, 64, 69, 87, 105}},
+ {{32, 38, 56, 63, 78, 113, 86, 97, 130, 169},
+ {32, 45, 55, 53, 62, 80, 63, 67, 84, 101}},
+ {{32, 37, 54, 58, 72, 102, 81, 91, 121, 156},
+ {32, 45, 54, 51, 59, 75, 61, 65, 81, 97}},
+ {{32, 34, 49, 53, 64, 91, 75, 81, 112, 140},
+ {32, 46, 53, 49, 55, 70, 58, 62, 78, 91}},
+ {{32, 34, 48, 49, 60, 82, 72, 79, 104, 134},
+ {32, 46, 53, 47, 54, 66, 57, 60, 75, 89}},
+ {{32, 33, 39, 45, 51, 71, 62, 64, 87, 108},
+ {31, 42, 48, 47, 50, 61, 53, 54, 67, 78}},
+ {{32, 33, 38, 42, 46, 63, 55, 57, 75, 92},
+ {31, 41, 48, 46, 48, 58, 51, 51, 62, 71}},
+ {{32, 32, 35, 38, 40, 54, 51, 49, 64, 81},
+ {31, 38, 47, 47, 46, 54, 49, 46, 57, 66}},
+ {{32, 32, 34, 35, 37, 48, 43, 43, 54, 65},
+ {31, 37, 44, 47, 47, 53, 47, 45, 53, 59}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54}},
+ {{32, 32, 32, 32, 33, 35, 35, 35, 38, 46},
+ {31, 32, 34, 38, 41, 47, 46, 46, 47, 52}},
+ {{31, 32, 32, 32, 32, 33, 32, 33, 34, 35},
+ {31, 31, 32, 34, 35, 39, 38, 40, 43, 47}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33},
+ {31, 31, 31, 31, 31, 32, 34, 35, 35, 39}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31}}};
+constexpr uint8_t kQuantizerMatrix8x8
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][36] = {
+ {{32, 32, 35, 38, 40, 54, 51, 49, 65, 82, 68, 63,
+ 78, 97, 117, 84, 76, 91, 111, 134, 152, 95, 89, 98,
+ 113, 138, 159, 183, 109, 102, 106, 121, 142, 168, 199, 220},
+ {31, 38, 47, 47, 46, 54, 50, 47, 57, 66, 57, 52,
+ 61, 72, 82, 63, 57, 66, 77, 88, 96, 67, 62, 67,
+ 75, 86, 95, 104, 71, 67, 68, 75, 84, 95, 107, 113}},
+ {{32, 32, 35, 37, 39, 51, 47, 46, 60, 73, 62, 58,
+ 71, 87, 105, 78, 72, 84, 100, 121, 140, 90, 84, 93,
+ 106, 129, 148, 169, 102, 96, 100, 113, 132, 155, 183, 201},
+ {31, 38, 47, 47, 47, 53, 48, 46, 55, 62, 54, 50,
+ 58, 67, 76, 61, 55, 63, 72, 83, 91, 66, 61, 65,
+ 73, 84, 92, 101, 69, 65, 66, 73, 82, 92, 103, 109}},
+ {{32, 32, 34, 35, 37, 48, 46, 45, 56, 70, 57, 54,
+ 64, 80, 93, 76, 70, 79, 96, 111, 134, 85, 79, 87,
+ 100, 121, 138, 156, 96, 90, 93, 105, 122, 144, 168, 184},
+ {31, 36, 43, 47, 47, 53, 48, 46, 54, 61, 52, 49,
+ 55, 65, 71, 60, 55, 60, 70, 78, 89, 64, 59, 63,
+ 71, 81, 89, 97, 67, 63, 64, 71, 79, 89, 99, 104}},
+ {{32, 32, 33, 35, 36, 46, 42, 42, 52, 63, 53, 51,
+ 60, 73, 86, 68, 64, 72, 84, 100, 117, 78, 74, 80,
+ 92, 109, 128, 140, 90, 84, 87, 98, 114, 133, 155, 168},
+ {31, 34, 39, 46, 47, 52, 47, 45, 52, 58, 50, 48,
+ 54, 62, 68, 57, 53, 58, 65, 73, 82, 61, 57, 61,
+ 68, 77, 86, 91, 65, 61, 62, 68, 76, 86, 95, 100}},
+ {{32, 32, 33, 34, 35, 39, 39, 40, 46, 56, 50, 48,
+ 53, 65, 78, 62, 59, 63, 75, 90, 105, 76, 71, 74,
+ 86, 101, 118, 134, 84, 79, 81, 92, 106, 123, 142, 153},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 55, 49, 47,
+ 50, 58, 65, 54, 51, 53, 61, 69, 76, 60, 56, 57,
+ 65, 73, 82, 89, 64, 59, 60, 66, 74, 83, 92, 96}},
+ {{32, 32, 33, 34, 35, 39, 38, 39, 45, 54, 46, 45,
+ 51, 61, 71, 56, 54, 58, 69, 80, 92, 68, 64, 68,
+ 78, 90, 103, 117, 78, 74, 76, 86, 99, 113, 128, 140},
+ {31, 34, 39, 42, 45, 48, 47, 46, 49, 54, 48, 46,
+ 50, 56, 61, 52, 49, 52, 58, 65, 71, 57, 53, 55,
+ 61, 68, 75, 82, 61, 57, 58, 64, 71, 79, 86, 91}},
+ {{31, 32, 32, 32, 33, 35, 35, 35, 38, 48, 42, 41,
+ 43, 54, 63, 51, 49, 49, 59, 71, 81, 59, 56, 56,
+ 66, 77, 89, 98, 69, 65, 64, 73, 85, 97, 108, 119},
+ {31, 32, 35, 38, 42, 47, 48, 47, 48, 53, 47, 45,
+ 45, 53, 58, 50, 47, 47, 54, 61, 66, 53, 50, 49,
+ 56, 63, 69, 73, 57, 54, 52, 58, 65, 72, 77, 82}},
+ {{31, 32, 32, 32, 32, 35, 34, 34, 37, 42, 38, 37,
+ 40, 47, 54, 46, 44, 45, 52, 60, 69, 52, 49, 49,
+ 56, 65, 75, 82, 63, 59, 58, 65, 73, 84, 92, 105},
+ {31, 31, 32, 38, 40, 47, 44, 44, 47, 50, 47, 45,
+ 46, 51, 54, 48, 46, 46, 51, 56, 61, 50, 47, 47,
+ 52, 57, 63, 66, 55, 52, 50, 54, 60, 66, 70, 76}},
+ {{31, 32, 32, 32, 32, 34, 34, 33, 35, 39, 35, 34,
+ 37, 42, 48, 41, 40, 41, 47, 53, 60, 47, 44, 45,
+ 51, 57, 65, 71, 53, 50, 51, 55, 61, 70, 77, 85},
+ {31, 31, 32, 35, 36, 41, 42, 42, 45, 48, 48, 46,
+ 47, 50, 53, 47, 45, 45, 49, 53, 57, 49, 46, 46,
+ 50, 54, 59, 61, 51, 48, 48, 51, 54, 60, 64, 68}},
+ {{31, 31, 32, 32, 32, 33, 32, 32, 34, 35, 34, 34,
+ 35, 37, 41, 37, 36, 38, 39, 45, 51, 43, 41, 42,
+ 42, 49, 56, 63, 47, 44, 45, 46, 52, 59, 67, 71},
+ {31, 31, 32, 34, 35, 39, 37, 40, 43, 47, 43, 43,
+ 45, 47, 49, 48, 46, 46, 47, 50, 53, 47, 45, 45,
+ 45, 50, 55, 58, 49, 46, 46, 46, 50, 55, 60, 61}},
+ {{31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 33, 33,
+ 34, 35, 37, 34, 34, 35, 36, 39, 43, 37, 36, 37,
+ 38, 41, 46, 51, 41, 39, 40, 41, 44, 49, 54, 58},
+ {31, 31, 31, 32, 33, 35, 35, 37, 39, 43, 39, 41,
+ 42, 45, 47, 45, 44, 45, 47, 48, 50, 48, 46, 46,
+ 47, 48, 51, 53, 48, 46, 45, 46, 47, 51, 54, 56}},
+ {{31, 31, 32, 31, 32, 32, 32, 32, 32, 33, 32, 32,
+ 32, 34, 35, 32, 33, 33, 34, 35, 36, 34, 34, 33,
+ 35, 36, 38, 39, 35, 35, 34, 36, 38, 40, 42, 48},
+ {31, 31, 31, 30, 31, 32, 34, 34, 35, 39, 36, 37,
+ 39, 42, 46, 39, 40, 41, 44, 47, 47, 42, 42, 42,
+ 45, 47, 48, 48, 48, 47, 46, 47, 47, 49, 50, 53}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 33, 34, 32, 32, 32,
+ 32, 34, 34, 35, 33, 33, 33, 33, 35, 35, 36, 38},
+ {31, 31, 31, 31, 31, 31, 30, 31, 31, 32, 34, 34,
+ 35, 35, 39, 35, 35, 36, 36, 40, 41, 37, 38, 39,
+ 40, 43, 44, 47, 40, 41, 41, 42, 44, 45, 47, 48}},
+ {{31, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 33, 33},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 32, 31, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 36, 39, 33, 34, 34, 35, 35, 36, 39, 39}},
+ {{31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32},
+ {31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31}}};
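+// Same triangular layout as assumed above: 32 * (32 + 1) / 2 = 528 entries
+// per plane type.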
+constexpr uint8_t kQuantizerMatrix32x32
+ [kNumQuantizerLevelsForQuantizerMatrix][kNumPlaneTypes][528] = {
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 33,
+ 33, 32, 32, 32, 33, 34, 35, 34, 34, 33, 34, 35, 37, 39,
+ 35, 34, 34, 35, 36, 37, 41, 43, 36, 35, 34, 35, 36, 38,
+ 42, 45, 48, 39, 38, 37, 38, 39, 40, 45, 47, 50, 54, 44,
+ 42, 41, 41, 42, 42, 47, 50, 54, 58, 63, 46, 44, 42, 43,
+ 44, 44, 49, 52, 55, 59, 65, 67, 48, 46, 44, 45, 45, 46,
+ 51, 53, 57, 61, 67, 69, 71, 54, 51, 49, 49, 50, 49, 54,
+ 57, 60, 65, 71, 74, 76, 82, 59, 56, 54, 54, 54, 53, 58,
+ 61, 64, 69, 75, 78, 80, 87, 92, 62, 59, 56, 56, 56, 55,
+ 60, 63, 66, 71, 77, 80, 83, 89, 95, 98, 65, 62, 59, 59,
+ 59, 58, 63, 65, 68, 73, 79, 82, 85, 92, 98, 101, 105, 71,
+ 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90, 97, 103,
+ 107, 111, 117, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90,
+ 93, 96, 104, 110, 114, 118, 125, 134, 81, 77, 73, 73, 72, 70,
+ 75, 77, 80, 85, 91, 94, 97, 105, 111, 115, 119, 126, 135, 137,
+ 83, 78, 75, 74, 74, 72, 76, 79, 81, 86, 92, 95, 99, 106,
+ 113, 117, 121, 128, 137, 138, 140, 88, 84, 80, 79, 78, 76, 80,
+ 82, 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147,
+ 152, 91, 86, 83, 82, 81, 79, 81, 84, 88, 92, 95, 100, 107,
+ 110, 115, 123, 127, 132, 140, 147, 151, 154, 159, 94, 89, 86, 85,
+ 84, 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136,
+ 139, 146, 156, 158, 161, 166, 97, 92, 90, 88, 86, 85, 84, 89,
+ 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143, 148, 152, 163,
+ 166, 168, 174, 101, 95, 93, 91, 89, 89, 87, 91, 93, 98, 101,
+ 105, 111, 113, 120, 126, 130, 138, 142, 149, 157, 159, 171, 174, 176,
+ 183, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111,
+ 116, 122, 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191,
+ 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110, 113, 120,
+ 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191, 193, 200,
+ 111, 105, 104, 101, 100, 99, 97, 96, 102, 103, 109, 111, 117, 120,
+ 125, 131, 135, 143, 146, 156, 158, 168, 173, 180, 189, 195, 200, 202,
+ 210, 115, 109, 108, 104, 104, 102, 101, 100, 103, 106, 111, 113, 119,
+ 121, 129, 131, 140, 142, 151, 155, 162, 168, 176, 183, 188, 199, 204,
+ 210, 212, 220, 119, 113, 112, 107, 107, 106, 105, 103, 105, 110, 112,
+ 117, 120, 125, 130, 135, 140, 145, 152, 157, 165, 169, 179, 183, 193,
+ 197, 210, 214, 220, 222, 231, 123, 116, 116, 111, 111, 109, 110, 107,
+ 107, 114, 114, 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176,
+ 177, 190, 191, 204, 206, 222, 224, 230, 232, 242},
+ {32, 31, 31, 30, 31, 32, 32, 33, 33, 35, 33, 34, 35, 37,
+ 39, 36, 38, 40, 41, 43, 47, 41, 42, 42, 43, 45, 47, 48,
+ 45, 45, 44, 45, 46, 47, 49, 50, 49, 47, 46, 47, 47, 48,
+ 50, 51, 53, 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 49,
+ 47, 45, 45, 45, 45, 49, 51, 53, 55, 58, 50, 47, 45, 46,
+ 46, 46, 49, 51, 54, 56, 59, 60, 50, 48, 46, 46, 46, 46,
+ 50, 52, 54, 56, 60, 60, 61, 52, 50, 47, 47, 47, 47, 50,
+ 52, 54, 57, 61, 62, 63, 66, 54, 52, 49, 49, 49, 48, 52,
+ 53, 55, 58, 62, 64, 65, 68, 71, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 57, 54, 52, 51,
+ 51, 50, 53, 55, 56, 60, 63, 65, 67, 70, 73, 75, 76, 60,
+ 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72, 75,
+ 77, 79, 82, 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67,
+ 69, 71, 75, 78, 80, 82, 85, 89, 64, 61, 58, 57, 57, 55,
+ 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90,
+ 65, 61, 58, 58, 57, 55, 58, 60, 61, 64, 68, 70, 71, 75,
+ 79, 81, 83, 86, 90, 91, 91, 67, 63, 61, 60, 59, 57, 60,
+ 61, 63, 66, 69, 70, 73, 77, 79, 81, 85, 88, 90, 92, 94,
+ 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74,
+ 75, 78, 82, 84, 86, 90, 93, 94, 96, 98, 69, 65, 63, 62,
+ 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87,
+ 89, 92, 96, 97, 98, 100, 70, 66, 64, 63, 62, 61, 60, 63,
+ 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 71, 67, 66, 64, 63, 62, 61, 63, 64, 67, 68,
+ 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100, 101, 102,
+ 104, 72, 68, 67, 65, 64, 64, 61, 63, 65, 67, 68, 71, 73,
+ 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102, 103, 104, 106,
+ 73, 69, 68, 66, 65, 65, 63, 63, 66, 67, 69, 71, 73, 76,
+ 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104, 105, 106, 109,
+ 74, 70, 70, 67, 66, 66, 64, 63, 66, 67, 70, 71, 74, 75,
+ 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68, 70, 71, 74,
+ 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70,
+ 72, 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104,
+ 105, 109, 111, 112, 113, 116, 78, 74, 74, 70, 70, 69, 69, 66,
+ 66, 70, 70, 74, 74, 77, 78, 82, 82, 86, 87, 92, 92, 96,
+ 97, 102, 102, 107, 107, 112, 113, 115, 115, 118}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 33, 34, 35, 32, 33, 33, 33, 34, 36, 36,
+ 34, 34, 33, 34, 35, 37, 38, 39, 36, 35, 34, 35, 36, 38,
+ 40, 42, 48, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 39,
+ 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 49, 47, 46, 45, 46, 46, 48,
+ 51, 57, 60, 62, 68, 71, 73, 54, 51, 50, 49, 50, 49, 51,
+ 54, 60, 63, 65, 71, 75, 77, 82, 59, 56, 54, 54, 54, 53,
+ 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 65,
+ 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92,
+ 98, 101, 105, 71, 68, 65, 65, 64, 63, 65, 68, 73, 76, 78,
+ 84, 89, 92, 97, 103, 106, 111, 117, 76, 72, 70, 69, 68, 66,
+ 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122, 127,
+ 80, 76, 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98,
+ 104, 110, 113, 118, 125, 130, 134, 83, 78, 76, 75, 74, 72, 73,
+ 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121, 128, 133, 137,
+ 140, 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100,
+ 103, 109, 116, 119, 124, 131, 136, 140, 144, 147, 89, 85, 82, 81,
+ 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114, 120, 128,
+ 131, 136, 146, 147, 150, 155, 92, 88, 85, 84, 82, 81, 80, 85,
+ 86, 90, 95, 97, 102, 107, 110, 117, 122, 125, 134, 138, 142, 152,
+ 154, 156, 162, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95,
+ 99, 105, 106, 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163,
+ 169, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102, 104,
+ 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170, 176,
+ 101, 96, 95, 92, 91, 90, 88, 88, 93, 95, 99, 103, 106, 112,
+ 114, 121, 124, 131, 136, 140, 149, 151, 160, 165, 173, 176, 178, 184,
+ 104, 99, 98, 95, 94, 93, 91, 90, 95, 96, 102, 103, 109, 112,
+ 117, 122, 125, 133, 136, 145, 146, 156, 160, 167, 174, 180, 184, 186,
+ 193, 108, 102, 101, 98, 97, 96, 95, 93, 97, 100, 104, 106, 111,
+ 113, 121, 122, 130, 132, 140, 143, 150, 155, 162, 169, 174, 183, 188,
+ 192, 194, 201, 111, 105, 105, 101, 100, 99, 98, 96, 98, 103, 105,
+ 109, 112, 117, 121, 125, 130, 135, 141, 146, 152, 156, 165, 169, 178,
+ 181, 193, 196, 201, 202, 210, 114, 109, 109, 104, 104, 102, 102, 99,
+ 100, 106, 106, 113, 113, 120, 121, 129, 130, 139, 140, 151, 151, 162,
+ 162, 175, 176, 187, 188, 203, 204, 210, 211, 219},
+ {32, 31, 31, 30, 31, 31, 31, 32, 32, 33, 33, 34, 35, 36, 39,
+ 36, 38, 39, 40, 43, 47, 38, 40, 41, 41, 44, 47, 47, 41, 42,
+ 42, 43, 45, 47, 48, 48, 49, 47, 46, 46, 47, 48, 49, 50, 53,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 48, 47, 46, 45, 46,
+ 46, 48, 49, 53, 54, 54, 49, 47, 45, 45, 45, 45, 47, 49, 53,
+ 55, 55, 58, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62,
+ 52, 50, 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66,
+ 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62, 64, 66, 68,
+ 71, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58, 59, 63, 65, 66,
+ 69, 72, 73, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63,
+ 66, 67, 70, 73, 74, 76, 60, 57, 55, 54, 53, 52, 53, 55, 58,
+ 60, 61, 65, 68, 69, 72, 75, 77, 79, 82, 62, 59, 57, 56, 55,
+ 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86,
+ 63, 60, 58, 57, 56, 54, 55, 57, 60, 62, 63, 67, 70, 71, 75,
+ 78, 79, 82, 85, 87, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61,
+ 63, 64, 68, 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 66, 63,
+ 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80, 81,
+ 84, 87, 90, 91, 93, 94, 67, 64, 62, 61, 59, 58, 58, 60, 63,
+ 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89, 93, 94, 95, 97,
+ 68, 65, 63, 62, 60, 59, 58, 61, 62, 64, 67, 68, 71, 74, 75,
+ 79, 81, 83, 87, 89, 91, 95, 96, 97, 99, 69, 66, 64, 63, 61,
+ 61, 59, 61, 62, 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88,
+ 91, 92, 97, 98, 98, 101, 70, 67, 65, 63, 62, 62, 60, 61, 63,
+ 65, 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99,
+ 100, 100, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102,
+ 105, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107,
+ 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109,
+ 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76,
+ 78, 80, 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108,
+ 111, 75, 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75,
+ 75, 79, 79, 83, 84, 88, 89, 93, 93, 98, 98, 102, 103, 108, 108,
+ 110, 110, 113}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 34, 34, 35,
+ 34, 34, 33, 33, 35, 36, 37, 39, 34, 34, 34, 34, 36, 36,
+ 37, 41, 42, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 50, 54, 41, 39, 39, 38,
+ 40, 40, 41, 46, 48, 51, 55, 56, 44, 42, 41, 41, 42, 42,
+ 42, 47, 50, 54, 58, 59, 63, 48, 46, 45, 44, 45, 45, 45,
+ 50, 53, 56, 61, 62, 66, 70, 49, 47, 46, 45, 46, 46, 46,
+ 51, 53, 57, 62, 63, 68, 71, 73, 54, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 76, 77, 82, 58, 55, 54, 53,
+ 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 59,
+ 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73,
+ 75, 79, 85, 87, 92, 97, 99, 105, 69, 66, 64, 63, 63, 62,
+ 61, 66, 68, 71, 76, 78, 83, 88, 90, 96, 100, 102, 109, 113,
+ 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 80, 76, 73, 72, 71, 70, 69,
+ 74, 76, 79, 84, 86, 90, 96, 98, 104, 109, 111, 118, 123, 125,
+ 134, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 83, 78, 76, 75,
+ 74, 73, 72, 76, 78, 81, 86, 88, 92, 98, 100, 106, 111, 113,
+ 121, 126, 128, 137, 139, 140, 87, 83, 81, 79, 78, 77, 75, 80,
+ 82, 85, 90, 91, 96, 101, 103, 110, 114, 117, 125, 129, 133, 142,
+ 143, 145, 150, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87, 89,
+ 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 93, 88, 86, 84, 83, 82, 80, 82, 85, 89, 90, 96, 98,
+ 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153, 156, 157, 163,
+ 95, 90, 89, 86, 85, 85, 83, 83, 88, 89, 93, 97, 99, 105,
+ 106, 113, 116, 122, 127, 130, 139, 140, 148, 153, 159, 162, 164, 169,
+ 98, 93, 92, 89, 88, 87, 86, 85, 89, 90, 96, 97, 102, 105,
+ 109, 114, 117, 124, 126, 134, 136, 144, 148, 154, 160, 166, 169, 170,
+ 176, 101, 96, 95, 91, 91, 90, 89, 87, 90, 93, 97, 99, 104,
+ 105, 112, 113, 121, 122, 130, 133, 139, 144, 150, 155, 160, 168, 172,
+ 176, 177, 184, 104, 99, 98, 94, 94, 92, 92, 90, 92, 96, 98,
+ 102, 104, 109, 112, 116, 121, 125, 130, 135, 141, 144, 152, 155, 163,
+ 166, 177, 179, 184, 185, 191, 107, 101, 101, 97, 97, 95, 95, 93,
+ 93, 99, 99, 105, 105, 112, 112, 120, 120, 129, 129, 139, 140, 149,
+ 149, 161, 161, 172, 172, 185, 186, 191, 192, 199},
+ {32, 31, 31, 30, 31, 31, 30, 31, 31, 32, 33, 34, 35, 35, 39,
+ 35, 36, 37, 37, 41, 43, 36, 38, 39, 40, 43, 45, 47, 41, 42,
+ 42, 42, 45, 46, 47, 48, 44, 44, 44, 44, 46, 46, 47, 49, 50,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 48, 47, 46, 45, 46,
+ 46, 46, 49, 51, 53, 54, 48, 47, 46, 45, 46, 46, 46, 49, 51,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61,
+ 51, 48, 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62,
+ 52, 50, 48, 47, 47, 47, 47, 50, 52, 54, 57, 58, 61, 63, 64,
+ 66, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 64,
+ 65, 68, 70, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60,
+ 62, 65, 66, 68, 70, 71, 57, 54, 53, 52, 51, 50, 50, 53, 54,
+ 56, 60, 61, 63, 66, 67, 70, 73, 73, 76, 59, 56, 54, 53, 53,
+ 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80,
+ 60, 57, 55, 54, 53, 53, 52, 55, 56, 58, 61, 63, 65, 68, 69,
+ 72, 75, 76, 79, 81, 82, 63, 60, 58, 57, 56, 55, 54, 57, 59,
+ 60, 63, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 64, 61,
+ 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78,
+ 79, 82, 85, 86, 89, 90, 65, 61, 60, 58, 57, 56, 55, 58, 59,
+ 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85, 86, 90, 91, 91,
+ 67, 63, 61, 60, 59, 58, 57, 60, 61, 63, 65, 66, 69, 72, 73,
+ 77, 79, 80, 84, 86, 88, 92, 93, 93, 95, 68, 64, 63, 61, 60,
+ 59, 58, 60, 61, 63, 65, 67, 70, 71, 74, 76, 78, 81, 83, 86,
+ 88, 89, 94, 94, 95, 97, 68, 65, 64, 62, 61, 60, 58, 59, 61,
+ 64, 64, 68, 69, 71, 74, 75, 79, 80, 83, 86, 87, 91, 92, 95,
+ 96, 97, 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67,
+ 69, 72, 72, 76, 78, 80, 83, 84, 88, 89, 92, 94, 97, 98, 99,
+ 101, 70, 67, 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71,
+ 73, 76, 77, 81, 82, 85, 86, 90, 91, 94, 96, 99, 100, 100, 103,
+ 71, 67, 67, 64, 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74,
+ 74, 78, 79, 83, 84, 87, 89, 91, 94, 95, 99, 100, 102, 102, 104,
+ 72, 68, 68, 65, 65, 64, 63, 61, 62, 65, 66, 68, 69, 71, 73,
+ 75, 77, 79, 82, 84, 87, 88, 92, 93, 96, 97, 101, 102, 104, 104,
+ 106, 73, 69, 69, 66, 66, 64, 64, 62, 62, 66, 66, 69, 69, 72,
+ 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99, 99, 104, 104,
+ 106, 106, 108}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 33, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 34, 35,
+ 32, 33, 33, 33, 34, 34, 36, 36, 34, 34, 34, 33, 35, 35,
+ 37, 38, 39, 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 36,
+ 35, 35, 34, 36, 36, 38, 40, 42, 47, 48, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 44, 42, 42, 41, 42, 42, 42,
+ 44, 47, 52, 54, 58, 60, 63, 47, 45, 45, 44, 44, 45, 45,
+ 47, 50, 55, 56, 60, 62, 66, 69, 48, 46, 45, 44, 45, 45,
+ 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 54, 51, 50, 49,
+ 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 56,
+ 53, 52, 51, 51, 51, 51, 53, 56, 60, 61, 66, 69, 73, 77,
+ 78, 84, 86, 59, 56, 55, 54, 54, 54, 53, 55, 58, 62, 64,
+ 69, 71, 75, 79, 80, 87, 89, 92, 64, 61, 60, 58, 58, 58,
+ 57, 59, 62, 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102,
+ 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 75, 79,
+ 84, 85, 92, 94, 98, 103, 105, 71, 68, 67, 65, 64, 64, 63,
+ 65, 68, 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111,
+ 117, 74, 71, 69, 68, 67, 67, 65, 67, 70, 74, 75, 80, 83,
+ 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 80, 76, 74, 72,
+ 71, 71, 69, 71, 74, 78, 79, 84, 86, 90, 95, 96, 104, 106,
+ 110, 116, 118, 125, 128, 134, 82, 78, 76, 74, 73, 73, 71, 73,
+ 76, 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127,
+ 131, 136, 139, 83, 78, 77, 75, 74, 74, 72, 73, 76, 80, 81,
+ 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 87, 83, 81, 79, 78, 78, 75, 77, 80, 83, 85, 90, 92,
+ 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142, 144, 145, 150,
+ 90, 85, 84, 81, 80, 80, 78, 78, 82, 84, 87, 91, 93, 98,
+ 99, 106, 108, 113, 118, 121, 129, 130, 137, 141, 147, 150, 151, 156,
+ 92, 88, 87, 84, 83, 82, 80, 80, 84, 85, 90, 91, 95, 98,
+ 102, 106, 109, 115, 117, 125, 126, 134, 137, 142, 148, 152, 155, 156,
+ 162, 95, 90, 89, 86, 85, 84, 83, 82, 85, 87, 91, 92, 97,
+ 98, 105, 105, 112, 114, 121, 123, 129, 133, 138, 143, 147, 155, 158,
+ 161, 162, 168, 97, 92, 92, 88, 88, 86, 86, 84, 85, 90, 91,
+ 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133, 140, 143, 150,
+ 152, 162, 164, 168, 168, 174, 100, 95, 95, 90, 90, 89, 89, 86,
+ 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129, 137,
+ 137, 147, 148, 157, 158, 169, 170, 174, 175, 181},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 33, 34, 34, 34, 37,
+ 33, 34, 35, 35, 38, 39, 36, 38, 39, 40, 42, 43, 47, 38, 40,
+ 40, 41, 43, 44, 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 49, 47, 47, 46, 47,
+ 47, 48, 49, 50, 52, 53, 48, 47, 46, 45, 46, 46, 46, 48, 49,
+ 52, 53, 54, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61,
+ 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 58, 60, 61,
+ 61, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 63, 66, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58,
+ 59, 62, 64, 64, 67, 68, 54, 52, 51, 49, 49, 49, 48, 49, 52,
+ 55, 55, 58, 60, 62, 64, 65, 68, 69, 71, 56, 54, 53, 51, 51,
+ 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75,
+ 57, 54, 53, 52, 51, 51, 50, 51, 53, 56, 56, 60, 61, 63, 66,
+ 67, 70, 71, 73, 76, 76, 60, 57, 56, 54, 53, 53, 52, 53, 55,
+ 58, 58, 61, 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 61, 58,
+ 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69, 73,
+ 74, 76, 79, 80, 83, 84, 63, 60, 59, 57, 56, 56, 54, 55, 57,
+ 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89,
+ 64, 61, 60, 58, 57, 57, 55, 56, 58, 60, 61, 64, 66, 68, 70,
+ 71, 75, 77, 79, 82, 82, 86, 87, 90, 91, 65, 61, 60, 58, 57,
+ 57, 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82,
+ 83, 86, 88, 90, 91, 91, 67, 63, 62, 60, 59, 59, 57, 58, 60,
+ 62, 63, 66, 67, 69, 72, 73, 77, 78, 80, 83, 84, 88, 89, 92,
+ 93, 93, 95, 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65,
+ 67, 70, 70, 74, 75, 78, 80, 81, 85, 86, 89, 91, 93, 94, 95,
+ 97, 68, 65, 64, 62, 61, 60, 59, 58, 61, 61, 64, 65, 67, 69,
+ 71, 73, 75, 78, 79, 83, 83, 87, 88, 91, 93, 95, 96, 97, 99,
+ 69, 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72,
+ 72, 76, 76, 80, 81, 84, 86, 88, 90, 92, 95, 96, 98, 98, 100,
+ 70, 66, 66, 63, 63, 62, 61, 60, 60, 63, 64, 66, 67, 69, 71,
+ 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98, 100, 100,
+ 102, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67, 70,
+ 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100,
+ 101, 101, 104}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 33, 33, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 33, 34, 35, 35, 33, 33, 33, 33, 34, 35,
+ 36, 36, 38, 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 36,
+ 35, 35, 34, 35, 36, 37, 38, 42, 42, 48, 36, 35, 35, 34,
+ 35, 36, 38, 38, 42, 43, 48, 49, 39, 38, 38, 37, 38, 39,
+ 40, 40, 44, 45, 50, 51, 54, 41, 39, 39, 38, 39, 40, 40,
+ 41, 45, 46, 51, 52, 55, 56, 44, 42, 42, 41, 41, 42, 42,
+ 42, 46, 47, 54, 54, 58, 59, 63, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 48, 46, 46, 44,
+ 45, 45, 45, 46, 50, 51, 57, 57, 61, 63, 67, 69, 71, 52,
+ 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59, 64, 65, 70,
+ 72, 74, 78, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60,
+ 60, 65, 67, 71, 74, 76, 80, 82, 58, 56, 55, 53, 53, 53,
+ 53, 53, 57, 58, 63, 64, 68, 70, 75, 77, 80, 84, 86, 91,
+ 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70,
+ 75, 78, 80, 85, 87, 91, 92, 65, 62, 61, 59, 59, 59, 58,
+ 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92, 97, 98,
+ 105, 66, 63, 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74,
+ 76, 80, 83, 86, 91, 93, 98, 99, 106, 107, 71, 68, 67, 65,
+ 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95,
+ 97, 103, 103, 111, 112, 117, 74, 71, 70, 68, 67, 67, 66, 65,
+ 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105, 106, 114,
+ 115, 120, 123, 80, 76, 75, 72, 72, 71, 70, 69, 73, 74, 79,
+ 79, 84, 86, 90, 93, 96, 101, 104, 110, 110, 118, 119, 125, 128,
+ 134, 81, 77, 77, 74, 73, 73, 71, 71, 74, 75, 80, 80, 85,
+ 87, 91, 94, 98, 103, 105, 111, 112, 120, 121, 127, 130, 136, 137,
+ 83, 78, 78, 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88,
+ 92, 95, 99, 104, 106, 112, 113, 121, 122, 128, 131, 137, 139, 140,
+ 86, 82, 81, 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91,
+ 95, 98, 101, 106, 109, 115, 116, 124, 125, 131, 135, 140, 142, 144,
+ 147, 89, 84, 84, 80, 80, 79, 78, 77, 79, 81, 85, 86, 91,
+ 92, 97, 98, 104, 106, 112, 114, 119, 123, 128, 132, 135, 142, 145,
+ 148, 149, 153, 91, 86, 86, 82, 82, 81, 80, 79, 80, 84, 85,
+ 88, 91, 94, 97, 100, 104, 107, 112, 115, 120, 123, 129, 132, 138,
+ 140, 148, 150, 153, 154, 159, 93, 88, 88, 84, 84, 83, 83, 80,
+ 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118, 119, 126,
+ 126, 135, 136, 144, 144, 155, 155, 159, 159, 164},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 31, 32, 32, 33, 34, 33, 34,
+ 35, 35, 37, 39, 35, 37, 37, 38, 39, 41, 44, 36, 38, 39, 40, 41, 43,
+ 46, 47, 40, 41, 41, 42, 43, 44, 46, 47, 48, 41, 42, 42, 42, 43, 45,
+ 46, 47, 48, 48, 49, 47, 47, 46, 46, 47, 47, 48, 50, 50, 53, 49, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 48, 47, 47, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53,
+ 53, 54, 55, 49, 47, 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56,
+ 58, 50, 47, 47, 45, 46, 46, 46, 46, 49, 49, 54, 54, 56, 57, 59, 60,
+ 50, 48, 48, 46, 46, 46, 46, 46, 49, 50, 54, 54, 56, 57, 60, 60, 61,
+ 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62, 63,
+ 65, 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 65, 66, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55, 55, 58, 59,
+ 62, 63, 65, 67, 68, 70, 54, 52, 51, 49, 49, 49, 48, 48, 51, 52, 55,
+ 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 57, 54, 54, 52, 51, 51, 50,
+ 50, 52, 53, 56, 57, 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 57, 55,
+ 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65, 67, 70, 71,
+ 73, 74, 77, 77, 60, 57, 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61,
+ 63, 65, 67, 68, 71, 72, 75, 75, 79, 79, 82, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80,
+ 83, 84, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61, 63, 65, 67,
+ 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 64, 61, 60, 58, 57, 57,
+ 56, 55, 57, 58, 61, 61, 64, 65, 68, 69, 71, 74, 75, 78, 78, 82, 83,
+ 86, 87, 89, 90, 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64,
+ 65, 68, 70, 71, 74, 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 66, 63,
+ 62, 60, 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76,
+ 79, 80, 84, 84, 87, 89, 91, 92, 93, 94, 67, 64, 63, 61, 60, 59, 58,
+ 57, 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87,
+ 89, 92, 93, 94, 94, 96, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62,
+ 64, 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94,
+ 96, 96, 98, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68,
+ 68, 71, 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97,
+ 99}},
+ {{32, 31, 32, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 34, 34, 35, 32, 32, 32, 32, 32, 34,
+ 34, 35, 35, 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 34,
+ 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39,
+ 40, 40, 45, 45, 50, 50, 54, 54, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 44, 42, 42, 41,
+ 41, 42, 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 48,
+ 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61,
+ 67, 67, 71, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51,
+ 57, 57, 61, 61, 67, 67, 71, 71, 54, 51, 51, 49, 49, 50,
+ 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 54, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65,
+ 65, 71, 71, 76, 76, 82, 82, 59, 56, 56, 54, 54, 54, 54,
+ 53, 53, 58, 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87,
+ 92, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64, 64,
+ 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 65, 62, 62, 59,
+ 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85,
+ 85, 92, 92, 98, 98, 105, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98,
+ 98, 105, 105, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68,
+ 73, 73, 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111,
+ 117, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73,
+ 78, 78, 84, 84, 90, 90, 97, 97, 103, 103, 111, 111, 117, 117,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 80, 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84,
+ 84, 90, 90, 96, 96, 104, 104, 110, 110, 118, 118, 125, 125, 134,
+ 134, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76, 81, 81,
+ 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121, 128, 128,
+ 137, 137, 140, 83, 78, 78, 75, 75, 74, 74, 72, 72, 76, 76,
+ 81, 81, 86, 86, 92, 92, 99, 99, 106, 106, 113, 113, 121, 121,
+ 128, 128, 137, 137, 140, 140, 87, 83, 83, 79, 79, 77, 77, 75,
+ 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109, 109, 116,
+ 116, 124, 124, 132, 132, 141, 141, 144, 144, 149},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 32, 30, 31, 31, 32, 32, 33, 34,
+ 34, 35, 35, 39, 33, 34, 34, 35, 35, 39, 39, 36, 38, 38, 40, 40, 43,
+ 43, 47, 36, 38, 38, 40, 40, 43, 43, 47, 47, 41, 42, 42, 42, 42, 45,
+ 45, 47, 47, 48, 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 49, 47,
+ 47, 46, 46, 47, 47, 48, 48, 50, 50, 53, 49, 47, 47, 46, 46, 47, 47,
+ 48, 48, 50, 50, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49,
+ 53, 53, 54, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54,
+ 54, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60,
+ 61, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60,
+ 60, 61, 61, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50, 54, 54, 57,
+ 57, 61, 61, 63, 63, 66, 52, 50, 50, 47, 47, 47, 47, 47, 47, 50, 50,
+ 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 54, 52, 52, 49, 49, 49, 49,
+ 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 54, 52,
+ 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62, 62, 65, 65,
+ 68, 68, 71, 71, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56,
+ 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73,
+ 76, 76, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61, 61,
+ 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 60, 57, 57, 54, 54, 53,
+ 53, 52, 52, 55, 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75,
+ 79, 79, 82, 82, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60,
+ 63, 63, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71,
+ 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 65, 61, 61, 58, 58, 57, 57,
+ 55, 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83,
+ 83, 86, 86, 90, 90, 91, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58,
+ 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90,
+ 90, 91, 91, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66,
+ 66, 69, 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93,
+ 95}},
+ {{32, 31, 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 32, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 33, 33, 34, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 34,
+ 34, 34, 33, 33, 34, 35, 35, 37, 37, 39, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 35, 35, 35, 34, 34, 35,
+ 36, 36, 38, 38, 42, 42, 46, 36, 35, 35, 34, 34, 35, 36,
+ 37, 38, 38, 42, 42, 47, 48, 38, 37, 37, 36, 36, 37, 38,
+ 38, 39, 40, 44, 44, 48, 50, 51, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 41, 40, 40, 39,
+ 38, 39, 40, 40, 41, 41, 46, 46, 50, 52, 54, 55, 57, 44,
+ 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54, 56,
+ 58, 60, 63, 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48,
+ 48, 53, 54, 57, 58, 60, 64, 65, 48, 46, 46, 45, 44, 45,
+ 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71,
+ 48, 46, 46, 45, 44, 45, 45, 45, 46, 46, 51, 51, 55, 57,
+ 59, 61, 63, 67, 68, 71, 71, 53, 51, 51, 49, 49, 49, 49,
+ 49, 49, 49, 54, 54, 58, 59, 62, 64, 67, 71, 72, 75, 75,
+ 81, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59,
+ 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 57, 55, 55, 53,
+ 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 59, 56, 56, 54, 54, 54, 54, 54,
+ 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80, 86,
+ 87, 90, 92, 62, 59, 59, 57, 56, 56, 56, 56, 55, 56, 60,
+ 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93, 95,
+ 98, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63, 67,
+ 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105,
+ 67, 64, 64, 62, 61, 61, 60, 60, 59, 60, 64, 64, 68, 69,
+ 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106, 108,
+ 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68, 72, 73,
+ 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111,
+ 113, 118, 119, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74,
+ 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110,
+ 114, 118, 120, 125, 126, 134, 80, 76, 76, 73, 72, 72, 71, 70,
+ 69, 70, 74, 74, 78, 79, 82, 84, 86, 90, 91, 96, 96, 103,
+ 104, 108, 110, 114, 118, 120, 125, 126, 134, 134},
+ {32, 31, 31, 31, 31, 31, 30, 31, 31, 31, 30, 31, 31, 31, 32, 32, 32,
+ 33, 33, 33, 35, 33, 34, 34, 35, 35, 37, 39, 34, 35, 35, 36, 36, 38,
+ 40, 41, 36, 38, 38, 39, 40, 41, 43, 44, 47, 37, 38, 39, 40, 40, 42,
+ 43, 44, 47, 47, 41, 42, 42, 42, 42, 43, 45, 45, 47, 47, 48, 41, 42,
+ 42, 42, 42, 43, 45, 45, 47, 47, 48, 48, 47, 46, 46, 46, 45, 46, 47,
+ 47, 47, 48, 50, 50, 52, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50,
+ 50, 52, 53, 49, 47, 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53,
+ 53, 48, 47, 47, 46, 45, 46, 46, 46, 46, 47, 49, 49, 52, 53, 54, 54,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 52, 53, 54, 55, 55,
+ 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53, 55, 55, 57,
+ 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56,
+ 57, 59, 59, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50, 50, 53, 54,
+ 55, 56, 58, 60, 60, 61, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 50,
+ 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 52, 50, 49, 48, 47, 47, 47,
+ 47, 46, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 52, 50,
+ 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 61,
+ 63, 63, 66, 66, 54, 51, 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54,
+ 55, 57, 58, 60, 62, 62, 65, 65, 67, 68, 69, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68,
+ 70, 71, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49, 52, 52, 55, 56, 58,
+ 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 57, 54, 54, 52, 52, 51,
+ 51, 51, 50, 50, 53, 53, 56, 56, 58, 60, 61, 63, 64, 67, 67, 70, 70,
+ 72, 73, 75, 76, 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56,
+ 57, 59, 60, 62, 64, 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 60, 57,
+ 57, 55, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66,
+ 68, 68, 72, 72, 74, 75, 77, 79, 80, 82, 60, 57, 57, 55, 54, 54, 54,
+ 53, 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75,
+ 76, 77, 79, 80, 82, 82, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57,
+ 57, 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83,
+ 85, 85, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60,
+ 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89,
+ 89}},
+ {{32, 31, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 35, 35, 33, 33, 33, 33, 33, 33, 34, 35, 35,
+ 36, 36, 38, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48,
+ 48, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44,
+ 45, 47, 50, 50, 53, 54, 41, 40, 40, 39, 38, 38, 40, 40, 40,
+ 41, 41, 45, 46, 48, 52, 52, 54, 55, 57, 44, 42, 42, 42, 41,
+ 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63,
+ 44, 42, 42, 42, 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54,
+ 54, 57, 58, 60, 63, 63, 47, 46, 45, 45, 44, 44, 44, 45, 45,
+ 45, 45, 49, 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 48, 47,
+ 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57, 60,
+ 61, 63, 67, 67, 70, 71, 50, 49, 48, 47, 46, 46, 47, 47, 47,
+ 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68, 68, 72, 73, 75,
+ 54, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53, 54, 56, 60,
+ 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 54, 52, 51, 50, 49,
+ 49, 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71,
+ 71, 75, 76, 78, 82, 82, 58, 56, 55, 54, 53, 53, 53, 53, 53,
+ 52, 52, 56, 57, 59, 63, 63, 67, 68, 70, 74, 74, 78, 79, 82,
+ 86, 86, 90, 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57,
+ 58, 60, 64, 64, 68, 69, 71, 75, 75, 79, 80, 83, 87, 87, 91,
+ 92, 61, 59, 58, 57, 56, 56, 56, 56, 55, 55, 55, 59, 60, 62,
+ 65, 65, 69, 70, 73, 77, 77, 81, 82, 85, 89, 89, 93, 94, 97,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 65, 63, 62, 61, 59, 59, 59, 59, 59, 58, 58, 62, 63, 65, 68,
+ 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98, 101, 105,
+ 105, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67, 69,
+ 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105,
+ 109, 109, 114},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 32, 30, 31,
+ 31, 31, 32, 32, 33, 33, 34, 34, 34, 34, 37, 33, 34, 34, 35, 35, 35,
+ 38, 39, 34, 36, 36, 36, 37, 37, 40, 40, 42, 36, 38, 38, 39, 40, 40,
+ 42, 43, 45, 47, 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 40, 41,
+ 41, 41, 42, 42, 44, 44, 45, 47, 47, 48, 41, 42, 42, 42, 42, 42, 44,
+ 45, 46, 47, 47, 48, 48, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47,
+ 49, 49, 50, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51,
+ 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 54,
+ 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54,
+ 54, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53,
+ 54, 55, 55, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 48, 49, 51,
+ 53, 53, 55, 55, 57, 58, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45,
+ 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 50, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 50, 49,
+ 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54, 56, 56, 58,
+ 60, 60, 61, 61, 51, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50,
+ 51, 54, 54, 56, 57, 58, 60, 60, 62, 62, 63, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63,
+ 65, 66, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47, 47, 49, 50, 52, 54,
+ 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 54, 52, 51, 50, 49, 49,
+ 49, 49, 48, 48, 48, 51, 51, 53, 55, 55, 58, 58, 60, 62, 62, 64, 65,
+ 66, 68, 68, 70, 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52,
+ 53, 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 55, 53,
+ 53, 52, 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60,
+ 63, 63, 65, 66, 67, 69, 69, 71, 72, 73, 57, 55, 54, 53, 52, 52, 51,
+ 51, 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68,
+ 70, 70, 73, 73, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50,
+ 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73,
+ 74, 76, 76, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56,
+ 58, 58, 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78,
+ 80}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 35, 36, 36, 36, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37,
+ 37, 38, 39, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39,
+ 39, 35, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 39, 41, 41, 43,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45, 48,
+ 48, 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47,
+ 50, 50, 51, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 42, 45,
+ 45, 47, 50, 50, 52, 54, 39, 39, 38, 38, 37, 37, 38, 39, 39, 39, 40,
+ 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 42, 41, 41, 41, 40, 40, 40,
+ 41, 41, 41, 42, 42, 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50, 54, 54, 56,
+ 58, 58, 61, 63, 44, 43, 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45,
+ 48, 48, 51, 54, 54, 56, 58, 58, 62, 64, 64, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66,
+ 66, 69, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45, 46, 46, 47, 51, 51,
+ 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 49, 48, 47, 47, 46, 45,
+ 45, 46, 46, 46, 46, 46, 48, 51, 51, 54, 57, 57, 60, 62, 62, 66, 68,
+ 68, 71, 72, 73, 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51,
+ 54, 54, 57, 59, 59, 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 54, 52,
+ 51, 51, 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63,
+ 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 55, 53, 53, 52, 51, 50, 50,
+ 51, 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73,
+ 76, 77, 78, 83, 83, 85, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53,
+ 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86,
+ 87, 88, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58,
+ 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92,
+ 92},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 31, 31, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35,
+ 35, 38, 33, 34, 34, 34, 35, 35, 36, 38, 39, 34, 35, 35, 36, 36, 36,
+ 37, 40, 40, 41, 36, 38, 38, 38, 39, 40, 40, 43, 43, 44, 47, 36, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 38, 39, 40, 40, 41, 41, 41,
+ 43, 44, 45, 47, 47, 47, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48,
+ 48, 45, 45, 45, 45, 44, 44, 44, 46, 46, 46, 47, 47, 48, 49, 49, 50,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50, 50, 51, 53,
+ 53, 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 49,
+ 49, 51, 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 49, 47, 47, 47, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 49, 47,
+ 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55,
+ 55, 55, 57, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 51, 53, 53, 55, 56, 56, 58, 58, 59, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59,
+ 59, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50,
+ 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 51, 49, 48, 48, 47, 46,
+ 46, 47, 47, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60,
+ 60, 61, 62, 62, 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48,
+ 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 52, 50,
+ 50, 49, 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56,
+ 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 53, 51, 50, 50, 48, 48, 48,
+ 48, 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62,
+ 63, 64, 64, 67, 67, 68, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48,
+ 48, 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68,
+ 68, 69, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52,
+ 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71,
+ 71}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 35, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39,
+ 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39,
+ 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 38, 40, 40,
+ 41, 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42,
+ 42, 43, 46, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 40, 42, 42, 44, 47, 48, 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 38, 37, 37, 37, 36, 36, 36,
+ 36, 37, 38, 38, 39, 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40,
+ 40, 42, 45, 45, 46, 49, 50, 50, 52, 54, 54, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55,
+ 55, 57, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44,
+ 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58,
+ 58, 60, 63, 63, 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43,
+ 43, 45, 48, 48, 49, 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55,
+ 56, 56, 58, 60, 60, 62, 66, 66, 67, 69, 48, 47, 46, 46, 45, 44, 44,
+ 45, 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61,
+ 63, 67, 67, 68, 70, 71, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45,
+ 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68,
+ 70, 71, 71, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48,
+ 50, 53, 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74,
+ 77},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31,
+ 31, 31, 31, 32, 30, 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 37, 33, 34, 34, 34, 35, 35,
+ 35, 36, 38, 39, 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 35, 36,
+ 37, 37, 37, 38, 38, 38, 41, 41, 41, 44, 36, 37, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 46, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48,
+ 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48,
+ 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47, 48, 49, 49,
+ 49, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 50, 52, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 48, 48,
+ 49, 50, 50, 51, 52, 53, 49, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47,
+ 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46,
+ 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52,
+ 53, 53, 54, 54, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 48, 49, 49, 50, 52, 53, 53, 54, 54, 54, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55,
+ 55, 55, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47,
+ 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 49, 47, 47, 47, 46, 45,
+ 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55,
+ 55, 57, 58, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 50, 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53,
+ 54, 54, 55, 56, 56, 57, 59, 59, 60, 61, 50, 49, 48, 48, 47, 46, 46,
+ 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56,
+ 58, 60, 60, 60, 61, 61, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46,
+ 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60,
+ 61, 61, 61, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46,
+ 48, 50, 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63,
+ 64}},
+ {{32, 31, 31, 31, 31, 32, 31, 31, 32, 32, 31, 31, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 36, 36, 36,
+ 37, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37,
+ 37, 38, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 35, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42,
+ 42, 42, 45, 48, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36,
+ 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48,
+ 48, 48, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39,
+ 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 39, 39, 38, 38, 38, 38,
+ 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47, 50,
+ 50, 50, 52, 54, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39,
+ 40, 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 39, 39,
+ 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45,
+ 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 41, 41, 40, 40, 40, 39, 39,
+ 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52,
+ 52, 54, 56, 56, 56, 58, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63,
+ 63},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 32, 30, 31, 31, 31, 31, 31,
+ 32, 32, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 35, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 37, 39, 39, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41,
+ 41, 41, 43, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47,
+ 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45, 47, 47, 47,
+ 47, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47,
+ 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46,
+ 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45,
+ 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44,
+ 44, 44, 45, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50,
+ 50, 50, 51, 53, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53,
+ 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 48, 48, 47, 47, 47, 46,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51, 53,
+ 53, 53, 53, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46,
+ 46, 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 48, 48,
+ 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 49, 48, 47, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 55, 55, 55, 56, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55,
+ 55, 57, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58,
+ 58}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36,
+ 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40,
+ 41, 41, 41, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 36, 35,
+ 35, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38,
+ 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 36, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42,
+ 42, 42, 45, 47, 48, 48, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34,
+ 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47,
+ 48, 48, 48, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49,
+ 50},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 36, 37, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35,
+ 37, 38, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39,
+ 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 34, 35, 36, 36, 36, 36, 36, 37, 37, 37, 37, 38, 40, 40, 40, 40, 42,
+ 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42, 42, 42, 44,
+ 46, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 45, 46, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 45, 46, 47, 47, 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40,
+ 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40,
+ 41, 41, 41, 41, 42, 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 40, 41,
+ 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47,
+ 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44,
+ 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 44, 44, 44, 44, 44, 44,
+ 44, 44, 44, 44, 44, 44, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49,
+ 49, 49, 49, 50, 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 49, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48,
+ 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 49, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 50, 51, 52, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52,
+ 53, 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53,
+ 53}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 36, 36, 36, 36, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37,
+ 38, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39,
+ 39},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38,
+ 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37,
+ 38, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 35, 36,
+ 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39, 41, 41, 41,
+ 41, 41, 42, 44, 36, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39,
+ 39, 40, 41, 42, 43, 43, 43, 43, 44, 45, 46, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40,
+ 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 47, 47, 40, 40, 40, 41, 41, 41, 41,
+ 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47,
+ 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48,
+ 48}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 31, 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 34, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 37, 37, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 39, 39, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 36, 36, 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40,
+ 40}},
+ {{32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32},
+ {32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32}}};
diff --git a/libgav1/src/reconstruction.cc b/libgav1/src/reconstruction.cc
index 97de9f0..1aa1233 100644
--- a/libgav1/src/reconstruction.cc
+++ b/libgav1/src/reconstruction.cc
@@ -14,6 +14,7 @@
#include "src/reconstruction.h"
+#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -48,6 +49,84 @@
return static_cast<dsp::TransformSize1D>(size_log2 - 2);
}
+// Returns the number of rows to process based on |non_zero_coeff_count|. The
+// transform loops process either 4 or a multiple of 8 rows. The
+// TransformClass derived from |tx_type| determines the scan order, which
+// bounds how many rows can hold the first |non_zero_coeff_count|
+// coefficients. A worked example follows the function.
+template <int tx_width>
+int GetNumRows(TransformType tx_type, int tx_height, int non_zero_coeff_count) {
+ const TransformClass tx_class = GetTransformClass(tx_type);
+
+ switch (tx_class) {
+ case kTransformClass2D:
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 13) return 4;
+ if (non_zero_coeff_count <= 29) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 10) return 4;
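+      // Bitwise & rather than && is used on the paired comparisons below;
+      // this evaluates both operands without a short-circuit branch, and both
+      // comparisons are cheap.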
+ if ((non_zero_coeff_count <= 14) & (tx_height > 8)) return 4;
+ if (non_zero_coeff_count <= 43) return 8;
+ if ((non_zero_coeff_count <= 107) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 171) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if ((non_zero_coeff_count <= 14) & (tx_height > 16)) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 44) & (tx_height > 16)) return 8;
+ if ((non_zero_coeff_count <= 151) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 279) & (tx_height > 16)) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 10) return 4;
+ if (non_zero_coeff_count <= 36) return 8;
+ if ((non_zero_coeff_count <= 136) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 300) & (tx_height > 16)) return 24;
+ }
+ break;
+
+ case kTransformClassHorizontal:
+ if (non_zero_coeff_count <= 4) return 4;
+ if (non_zero_coeff_count <= 8) return 8;
+ if ((non_zero_coeff_count <= 16) & (tx_height > 16)) return 16;
+ if ((non_zero_coeff_count <= 24) & (tx_height > 16)) return 24;
+ break;
+
+ default:
+ assert(tx_class == kTransformClassVertical);
+ if (tx_width == 4) {
+ if (non_zero_coeff_count <= 16) return 4;
+ if (non_zero_coeff_count <= 32) return 8;
+ }
+ if (tx_width == 8) {
+ if (non_zero_coeff_count <= 32) return 4;
+ if (non_zero_coeff_count <= 64) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 8x8: 63, 8x16: 127.
+ if (non_zero_coeff_count <= 128) return 16;
+ if (non_zero_coeff_count <= 192) return 24;
+ }
+ if (tx_width == 16) {
+ if (non_zero_coeff_count <= 64) return 4;
+ if (non_zero_coeff_count <= 128) return 8;
+ // There's no need to check tx_height since the maximum values for
+ // smaller sizes are: 16x8: 127, 16x16: 255.
+ if (non_zero_coeff_count <= 256) return 16;
+ if (non_zero_coeff_count <= 384) return 24;
+ }
+ if (tx_width == 32) {
+ if (non_zero_coeff_count <= 128) return 4;
+ if (non_zero_coeff_count <= 256) return 8;
+        // There's no need to check tx_height since the maximum values for
+        // smaller sizes are: 32x8: 255, 32x16: 511.
+        if (non_zero_coeff_count <= 512) return 16;
+        if (non_zero_coeff_count <= 768) return 24;
+ }
+ break;
+ }
+ return (tx_width >= 16) ? std::min(tx_height, 32) : tx_height;
+}
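+// Worked example: for an 8x32 block in kTransformClass2D, a
+// non_zero_coeff_count of 50 fails the <= 43 check but satisfies
+// (50 <= 107) & (32 > 16), so 16 rows are processed; counts above 171 fall
+// through to the final return and process all 32 rows.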
+
} // namespace
template <typename Residual, typename Pixel>
@@ -59,17 +138,28 @@
const int tx_width_log2 = kTransformWidthLog2[tx_size];
const int tx_height_log2 = kTransformHeightLog2[tx_size];
+ int tx_height = (non_zero_coeff_count == 1) ? 1 : kTransformHeight[tx_size];
+ if (tx_height > 4) {
+ static constexpr int (*kGetNumRows[])(TransformType tx_type, int tx_height,
+ int non_zero_coeff_count) = {
+ &GetNumRows<4>, &GetNumRows<8>, &GetNumRows<16>, &GetNumRows<32>,
+ &GetNumRows<32>};
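+    // The index tx_width_log2 - 2 maps widths 4, 8, 16, 32, 64 to entries
+    // 0 through 4; width 64 reuses GetNumRows<32> because only the first 32
+    // columns of a 64-wide transform carry coded coefficients.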
+ tx_height = kGetNumRows[tx_width_log2 - 2](tx_type, tx_height,
+ non_zero_coeff_count);
+ }
+ assert(tx_height <= 32);
+
// Row transform.
const dsp::TransformSize1D row_transform_size =
Get1DTransformSize(tx_width_log2);
const dsp::Transform1D row_transform =
lossless ? dsp::k1DTransformWht : kRowTransform[tx_type];
const dsp::InverseTransformAddFunc row_transform_func =
- dsp.inverse_transforms[row_transform_size][row_transform];
+ dsp.inverse_transforms[row_transform][row_transform_size][dsp::kRow];
assert(row_transform_func != nullptr);
- row_transform_func(tx_type, tx_size, buffer, start_x, start_y, frame,
- /*is_row=*/true, non_zero_coeff_count);
+ row_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
// Column transform.
const dsp::TransformSize1D column_transform_size =
@@ -77,11 +167,12 @@
const dsp::Transform1D column_transform =
lossless ? dsp::k1DTransformWht : kColumnTransform[tx_type];
const dsp::InverseTransformAddFunc column_transform_func =
- dsp.inverse_transforms[column_transform_size][column_transform];
+ dsp.inverse_transforms[column_transform][column_transform_size]
+ [dsp::kColumn];
assert(column_transform_func != nullptr);
- column_transform_func(tx_type, tx_size, buffer, start_x, start_y, frame,
- /*is_row=*/false, non_zero_coeff_count);
+ column_transform_func(tx_type, tx_size, tx_height, buffer, start_x, start_y,
+ frame);
}
template void Reconstruct(const dsp::Dsp& dsp, TransformType tx_type,
diff --git a/libgav1/src/residual_buffer_pool.cc b/libgav1/src/residual_buffer_pool.cc
index e166392..44a842c 100644
--- a/libgav1/src/residual_buffer_pool.cc
+++ b/libgav1/src/residual_buffer_pool.cc
@@ -129,7 +129,8 @@
}
void ResidualBufferPool::Release(std::unique_ptr<ResidualBuffer> buffer) {
- buffer->transform_parameters()->Reset();
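+  // Clear both queues so a reused buffer starts out empty.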
+ buffer->transform_parameters()->Clear();
+ buffer->partition_tree_order()->Clear();
std::lock_guard<std::mutex> lock(mutex_);
buffers_.Push(std::move(buffer));
}
diff --git a/libgav1/src/residual_buffer_pool.h b/libgav1/src/residual_buffer_pool.h
index f7bc75d..75924db 100644
--- a/libgav1/src/residual_buffer_pool.h
+++ b/libgav1/src/residual_buffer_pool.h
@@ -27,73 +27,11 @@
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
#include "src/utils/memory.h"
+#include "src/utils/queue.h"
#include "src/utils/types.h"
namespace libgav1 {
-// A simple fixed size queue implementation to hold the transform parameters
-// when |Tile::split_parse_and_decode_| is true. We don't have to do any
-// boundary checks since we always push data into the queue before accessing it.
-class TransformParameterQueue {
- public:
- TransformParameterQueue() = default;
-
- // Move only.
- TransformParameterQueue(TransformParameterQueue&& other) = default;
- TransformParameterQueue& operator=(TransformParameterQueue&& other) = default;
-
- LIBGAV1_MUST_USE_RESULT bool Init(int max_size) {
- max_size_ = max_size;
- // No initialization is necessary since the data will be always written to
- // before being read.
- non_zero_coeff_count_.reset(new (std::nothrow) int16_t[max_size_]);
- tx_type_.reset(new (std::nothrow) TransformType[max_size_]);
- return non_zero_coeff_count_ != nullptr && tx_type_ != nullptr;
- }
-
- // Adds the |non_zero_coeff_count| and the |tx_type| to the back of the queue.
- void Push(int non_zero_coeff_count, TransformType tx_type) {
- assert(back_ < max_size_);
- non_zero_coeff_count_[back_] = non_zero_coeff_count;
- tx_type_[back_++] = tx_type;
- }
-
- // Returns the non_zero_coeff_count at the front of the queue.
- int16_t NonZeroCoeffCount() const {
- assert(front_ != back_);
- return non_zero_coeff_count_[front_];
- }
-
- // Returns the tx_type at the front of the queue.
- TransformType Type() const {
- assert(front_ != back_);
- return tx_type_[front_];
- }
-
- // Removes the |non_zero_coeff_count| and the |tx_type| from the front of the
- // queue.
- void Pop() {
- assert(front_ != back_);
- ++front_;
- }
-
- // Clears the queue.
- void Reset() {
- front_ = 0;
- back_ = 0;
- }
-
- // Used only in the tests. Returns the number of elements in the queue.
- int Size() const { return back_ - front_; }
-
- private:
- int max_size_ = 0;
- std::unique_ptr<int16_t[]> non_zero_coeff_count_;
- std::unique_ptr<TransformType[]> tx_type_;
- int front_ = 0;
- int back_ = 0;
-};
-
// This class is used for parsing and decoding a superblock. Members of this
// class are populated in the "parse" step and consumed in the "decode" step.
class ResidualBuffer : public Allocable {
@@ -104,7 +42,8 @@
if (buffer != nullptr) {
buffer->buffer_ = MakeAlignedUniquePtr<uint8_t>(32, buffer_size);
if (buffer->buffer_ == nullptr ||
- !buffer->transform_parameters_.Init(queue_size)) {
+ !buffer->transform_parameters_.Init(queue_size) ||
+ !buffer->partition_tree_order_.Init(queue_size)) {
buffer = nullptr;
}
}
@@ -118,9 +57,14 @@
// Buffer used to store the residual values.
uint8_t* buffer() { return buffer_.get(); }
// Queue used to store the transform parameters.
- TransformParameterQueue* transform_parameters() {
+ Queue<TransformParameters>* transform_parameters() {
return &transform_parameters_;
}
+  // Queue used to store the ordering of the blocks in the partition tree of
+  // each superblock.
+ Queue<PartitionTreeNode>* partition_tree_order() {
+ return &partition_tree_order_;
+ }
private:
friend class ResidualBufferStack;
@@ -128,7 +72,8 @@
ResidualBuffer() = default;
AlignedUniquePtr<uint8_t> buffer_;
- TransformParameterQueue transform_parameters_;
+ Queue<TransformParameters> transform_parameters_;
+ Queue<PartitionTreeNode> partition_tree_order_;
// Used by ResidualBufferStack to form a chain of ResidualBuffers.
ResidualBuffer* next_ = nullptr;
};
diff --git a/libgav1/src/symbol_decoder_context.cc b/libgav1/src/symbol_decoder_context.cc
index 159f25c..26a281e 100644
--- a/libgav1/src/symbol_decoder_context.cc
+++ b/libgav1/src/symbol_decoder_context.cc
@@ -319,20 +319,4 @@
}
}
-int SymbolDecoderContext::TxTypeIndex(TransformSet tx_set) {
- assert(tx_set != kTransformSetDctOnly);
- switch (tx_set) {
- case kTransformSetInter1:
- case kTransformSetIntra1:
- return 0;
- case kTransformSetInter2:
- case kTransformSetIntra2:
- return 1;
- case kTransformSetInter3:
- return 2;
- default:
- return -1;
- }
-}
-
} // namespace libgav1
diff --git a/libgav1/src/symbol_decoder_context.h b/libgav1/src/symbol_decoder_context.h
index 8713f5b..1bea76c 100644
--- a/libgav1/src/symbol_decoder_context.h
+++ b/libgav1/src/symbol_decoder_context.h
@@ -17,10 +17,12 @@
#ifndef LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
#define LIBGAV1_SRC_SYMBOL_DECODER_CONTEXT_H_
+#include <cassert>
#include <cstdint>
#include "src/dsp/constants.h"
#include "src/utils/constants.h"
+#include "src/utils/memory.h"
namespace libgav1 {
@@ -101,7 +103,21 @@
// Returns the cdf array index for inter_tx_type or intra_tx_type based on
// |tx_set|.
- static int TxTypeIndex(TransformSet tx_set);
+ static int TxTypeIndex(TransformSet tx_set) {
+ assert(tx_set != kTransformSetDctOnly);
+ switch (tx_set) {
+ case kTransformSetInter1:
+ case kTransformSetIntra1:
+ return 0;
+ case kTransformSetInter2:
+ case kTransformSetIntra2:
+ return 1;
+ case kTransformSetInter3:
+ return 2;
+ default:
+ return -1;
+ }
+ }
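+  // e.g. TxTypeIndex(kTransformSetIntra2) returns 1, selecting the second
+  // entry of intra_tx_type_cdf.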
// Resets the intra_frame_y_mode_cdf array to the default.
void ResetIntraFrameYModeCdf();
@@ -110,117 +126,175 @@
// the last used element in the innermost dimension of each of the CDF array.
void ResetCounters();
- uint16_t partition_cdf[kBlockWidthCount][kPartitionContexts]
- [kMaxPartitionTypes + 1];
- uint16_t segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
- uint16_t use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
- [kBooleanFieldCdfSize];
- uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
- uint16_t skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
- uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
- uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
- uint16_t delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
- uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
- uint16_t intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
- [kIntraPredictionModesY + 1];
- uint16_t y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
- uint16_t angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
- uint16_t uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
- [kIntraPredictionModesUV + 1];
- uint16_t cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
- uint16_t cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
- uint16_t use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
- uint16_t filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
- uint16_t tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
- uint16_t tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
- uint16_t all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+  // Note: kMaxAlignment allows aligned load/store instructions to be used
+  // for the copies done in Initialize().
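+  // (For example, a 16-byte-aligned field lets a single SSE2 or NEON
+  // register copy a full vector at a time, assuming kMaxAlignment is at
+  // least 16.)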
+ alignas(kMaxAlignment) uint16_t
+ partition_cdf[kBlockWidthCount][kPartitionContexts]
+ [kMaxPartitionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ segment_id_cdf[kSegmentIdContexts][kMaxSegments + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_predicted_segment_id_cdf[kUsePredictedSegmentIdContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t skip_cdf[kSkipContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ skip_mode_cdf[kSkipModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t delta_q_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t delta_lf_cdf[kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ delta_lf_multi_cdf[kFrameLfCount][kDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t intra_block_copy_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ intra_frame_y_mode_cdf[kIntraModeContexts][kIntraModeContexts]
+ [kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ y_mode_cdf[kYModeContexts][kIntraPredictionModesY + 1];
+ alignas(kMaxAlignment) uint16_t
+ angle_delta_cdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ uv_mode_cdf[kBooleanSymbolCount][kIntraPredictionModesY]
+ [kIntraPredictionModesUV + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_signs_cdf[kCflAlphaSignsSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ cfl_alpha_cdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_filter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ filter_intra_mode_cdf[kNumFilterIntraPredictors + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_depth_cdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ tx_split_cdf[kTxSplitContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ all_zero_cdf[kNumSquareTransformSizes][kAllZeroContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_tx_type_cdf[3][kNumExtendedTransformSizes][kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ intra_tx_type_cdf[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
+ [kNumTransformTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt16SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt32SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt64SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt128SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts][kEobPt256SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes][kEobExtraContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseEobContexts][kCoeffBaseEobSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
+ [kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ restoration_type_cdf[kRestorationTypeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
[kBooleanFieldCdfSize];
- uint16_t inter_tx_type_cdf[3][kNumExtendedTransformSizes]
- [kNumTransformTypes + 1];
- uint16_t intra_tx_type_cdf[2][kNumExtendedTransformSizes]
- [kIntraPredictionModesY][kNumTransformTypes + 1];
- uint16_t eob_pt_16_cdf[kNumPlaneTypes][kEobPtContexts]
- [kEobPt16SymbolCount + 1];
- uint16_t eob_pt_32_cdf[kNumPlaneTypes][kEobPtContexts]
- [kEobPt32SymbolCount + 1];
- uint16_t eob_pt_64_cdf[kNumPlaneTypes][kEobPtContexts]
- [kEobPt64SymbolCount + 1];
- uint16_t eob_pt_128_cdf[kNumPlaneTypes][kEobPtContexts]
- [kEobPt128SymbolCount + 1];
- uint16_t eob_pt_256_cdf[kNumPlaneTypes][kEobPtContexts]
- [kEobPt256SymbolCount + 1];
- uint16_t eob_pt_512_cdf[kNumPlaneTypes][kEobPt512SymbolCount + 1];
- uint16_t eob_pt_1024_cdf[kNumPlaneTypes][kEobPt1024SymbolCount + 1];
- uint16_t eob_extra_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
- [kEobExtraContexts][kBooleanFieldCdfSize];
- uint16_t coeff_base_eob_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
- [kCoeffBaseEobContexts]
- [kCoeffBaseEobSymbolCount + 1];
- uint16_t coeff_base_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
- [kCoeffBaseContexts][kCoeffBaseSymbolCount + 1];
- uint16_t coeff_base_range_cdf[kNumSquareTransformSizes][kNumPlaneTypes]
- [kCoeffBaseRangeContexts]
- [kCoeffBaseRangeSymbolCount + 1];
- uint16_t dc_sign_cdf[kNumPlaneTypes][kDcSignContexts][kBooleanFieldCdfSize];
- uint16_t restoration_type_cdf[kRestorationTypeSymbolCount + 1];
- uint16_t use_wiener_cdf[kBooleanFieldCdfSize];
- uint16_t use_sgrproj_cdf[kBooleanFieldCdfSize];
- uint16_t has_palette_y_cdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
- [kBooleanFieldCdfSize];
- uint16_t palette_y_size_cdf[kPaletteBlockSizeContexts]
- [kPaletteSizeSymbolCount + 1];
- uint16_t has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
- uint16_t palette_uv_size_cdf[kPaletteBlockSizeContexts]
- [kPaletteSizeSymbolCount + 1];
- uint16_t palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
- [kPaletteColorIndexContexts]
- [kPaletteColorIndexSymbolCount + 1];
- uint16_t is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
- uint16_t use_compound_reference_cdf[kUseCompoundReferenceContexts]
- [kBooleanFieldCdfSize];
- uint16_t compound_reference_type_cdf[kCompoundReferenceTypeContexts]
- [kBooleanFieldCdfSize];
- uint16_t compound_reference_cdf[kNumCompoundReferenceTypes]
- [kReferenceContexts][3][kBooleanFieldCdfSize];
- uint16_t compound_backward_reference_cdf[kReferenceContexts][2]
- [kBooleanFieldCdfSize];
- uint16_t single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
- uint16_t compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
- [kNumCompoundInterPredictionModes + 1];
- uint16_t new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
- uint16_t zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
- uint16_t reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
- uint16_t ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
- uint16_t is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
- uint16_t inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
- uint16_t is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
- uint16_t wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
- uint16_t use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
- uint16_t motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
- uint16_t is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
- [kBooleanFieldCdfSize];
- uint16_t is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
- [kBooleanFieldCdfSize];
- uint16_t compound_type_cdf[kMaxBlockSizes]
- [kNumExplicitCompoundPredictionTypes + 1];
- uint16_t interpolation_filter_cdf[kInterpolationFilterContexts]
- [kNumExplicitInterpolationFilters + 1];
- uint16_t mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
- uint16_t mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
- uint16_t mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
- uint16_t mv_class0_bit_cdf[kMvContexts][kNumMvComponents]
- [kBooleanFieldCdfSize];
- uint16_t mv_class0_fraction_cdf[kMvContexts][kNumMvComponents]
- [kBooleanSymbolCount]
- [kMvFractionSymbolCount + 1];
- uint16_t mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
- [kBooleanFieldCdfSize];
- uint16_t mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
- [kBooleanFieldCdfSize];
- uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
- [kMvFractionSymbolCount + 1];
- uint16_t mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+ alignas(kMaxAlignment) uint16_t
+ palette_y_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ has_palette_uv_cdf[kPaletteUVModeContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ palette_uv_size_cdf[kPaletteBlockSizeContexts]
+ [kPaletteSizeSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ palette_color_index_cdf[kNumPlaneTypes][kPaletteSizeSymbolCount]
+ [kPaletteColorIndexContexts]
+ [kPaletteColorIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_cdf[kIsInterContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ use_compound_reference_cdf[kUseCompoundReferenceContexts]
[kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_type_cdf[kCompoundReferenceTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_reference_cdf[kNumCompoundReferenceTypes][kReferenceContexts][3]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_backward_reference_cdf[kReferenceContexts][2]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ single_reference_cdf[kReferenceContexts][6][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_prediction_mode_cdf[kCompoundPredictionModeContexts]
+ [kNumCompoundInterPredictionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ new_mv_cdf[kNewMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ zero_mv_cdf[kZeroMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ reference_mv_cdf[kReferenceMvContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ ref_mv_index_cdf[kRefMvIndexContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_inter_intra_cdf[kInterIntraContexts][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ inter_intra_mode_cdf[kInterIntraContexts][kNumInterIntraModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_wedge_inter_intra_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ wedge_index_cdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ use_obmc_cdf[kMaxBlockSizes][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ motion_mode_cdf[kMaxBlockSizes][kNumMotionModes + 1];
+ alignas(kMaxAlignment) uint16_t
+ is_explicit_compound_type_cdf[kIsExplicitCompoundTypeContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ is_compound_type_average_cdf[kIsCompoundTypeAverageContexts]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ compound_type_cdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ interpolation_filter_cdf[kInterpolationFilterContexts]
+ [kNumExplicitInterpolationFilters + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_joint_cdf[kMvContexts][kNumMvJointTypes + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_sign_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class_cdf[kMvContexts][kNumMvComponents][kMvClassSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_bit_cdf[kMvContexts][kNumMvComponents][kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_fraction_cdf[kMvContexts][kNumMvComponents][kBooleanSymbolCount]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_class0_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t
+ mv_bit_cdf[kMvContexts][kNumMvComponents][kMvBitSymbolCount]
+ [kBooleanFieldCdfSize];
+ alignas(kMaxAlignment) uint16_t mv_fraction_cdf[kMvContexts][kNumMvComponents]
+ [kMvFractionSymbolCount + 1];
+ alignas(kMaxAlignment) uint16_t
+ mv_high_precision_cdf[kMvContexts][kNumMvComponents]
+ [kBooleanFieldCdfSize];
};
} // namespace libgav1
diff --git a/libgav1/src/symbol_decoder_context_cdfs.inc b/libgav1/src/symbol_decoder_context_cdfs.inc
index 7f8f2c2..509286f 100644
--- a/libgav1/src/symbol_decoder_context_cdfs.inc
+++ b/libgav1/src/symbol_decoder_context_cdfs.inc
@@ -15,7 +15,7 @@
// This file is just a convenience to separate out all the CDF constant
// definitions from the symbol decoder context functions.
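+// The alignas qualifiers below match the alignment of the corresponding
+// fields in SymbolDecoderContext so that Initialize() can copy these tables
+// with aligned instructions.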
-constexpr uint16_t kDefaultPartitionCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPartitionCdf
[kBlockWidthCount][kPartitionContexts][kMaxPartitionTypes + 1] = {
// width 8
{{13636, 7258, 2376, 0, 0},
@@ -43,32 +43,34 @@
{27339, 26092, 25646, 741, 541, 237, 186, 0, 0},
{32057, 31802, 31596, 320, 230, 151, 104, 0, 0}}};
-constexpr uint16_t kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] =
- {{27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
- {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
- {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSegmentIdCdf[kSegmentIdContexts][kMaxSegments + 1] = {
+ {27146, 24875, 16675, 14535, 4959, 4395, 235, 0, 0},
+ {18494, 14538, 10211, 7833, 2788, 1917, 424, 0, 0},
+ {5241, 4281, 4045, 3878, 371, 121, 89, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultUsePredictedSegmentIdCdf[kUsePredictedSegmentIdContexts]
[kBooleanFieldCdfSize] = {{16384, 0, 0},
{16384, 0, 0},
{16384, 0, 0}};
-constexpr uint16_t kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
- {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultSkipCdf[kSkipContexts][kBooleanFieldCdfSize] = {
+ {1097, 0, 0}, {16253, 0, 0}, {28192, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultSkipModeCdf[kSkipModeContexts][kBooleanFieldCdfSize] = {
{147, 0, 0}, {12060, 0, 0}, {24641, 0, 0}};
// This constant is also used for DeltaLf and DeltaLfMulti.
-constexpr uint16_t kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0,
- 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultDeltaQCdf[kDeltaSymbolCount + 1] = {4608, 648, 91, 0, 0};
-constexpr uint16_t kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0,
- 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIntraBlockCopyCdf[kBooleanFieldCdfSize] = {2237, 0, 0};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultIntraFrameYModeCdf[kIntraModeContexts][kIntraModeContexts]
[kIntraPredictionModesY + 1] = {
{{17180, 15741, 13430, 12550, 12086, 11658,
@@ -122,7 +124,7 @@
{25150, 24480, 22909, 22259, 17382, 14111,
9865, 3992, 3588, 1413, 966, 175, 0, 0}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultYModeCdf[kYModeContexts][kIntraPredictionModesY + 1] = {
{9967, 9279, 8475, 8012, 7167, 6645, 6162, 5350, 4823, 3540, 3083, 2419,
0, 0},
@@ -133,7 +135,7 @@
{12613, 11467, 9930, 9590, 9507, 9235, 9065, 7964, 7416, 6193, 5752,
4719, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultAngleDeltaCdf[kDirectionalIntraModes][kAngleDeltaSymbolCount + 1] =
{{30588, 27736, 25201, 9992, 5779, 2551, 0, 0},
{30467, 27160, 23967, 9281, 5794, 2438, 0, 0},
@@ -144,7 +146,7 @@
{30528, 21672, 17315, 12427, 10207, 3851, 0, 0},
{29163, 22340, 20309, 15092, 11524, 2113, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultUVModeCdf[kBooleanSymbolCount][kIntraPredictionModesY]
[kIntraPredictionModesUV + 1] = {
// CFL not allowed.
@@ -202,24 +204,26 @@
{29624, 27681, 25386, 25264, 25175, 25078, 24967,
24704, 24536, 23520, 22893, 22247, 3720, 0, 0}}};
-constexpr uint16_t kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
- 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaSignsCdf[kCflAlphaSignsSymbolCount + 1] = {
+ 31350, 30645, 19428, 14363, 5796, 4425, 474, 0, 0};
-constexpr uint16_t kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount +
- 1] = {
- {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44, 0,
- 0},
- {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88, 84,
- 0, 0},
- {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75, 71,
- 0, 0},
- {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0, 0},
- {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192, 175,
- 146, 0, 0},
- {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174, 146,
- 112, 108, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCflAlphaCdf[kCflAlphaContexts][kCflAlphaSymbolCount + 1] = {
+ {25131, 12049, 1367, 287, 111, 80, 76, 72, 68, 64, 60, 56, 52, 48, 44,
+ 0, 0},
+ {18403, 9165, 4633, 1600, 601, 373, 281, 195, 148, 121, 100, 96, 92, 88,
+ 84, 0, 0},
+ {21236, 10388, 4323, 1408, 419, 245, 184, 119, 95, 91, 87, 83, 79, 75,
+ 71, 0, 0},
+ {5778, 1366, 486, 197, 76, 72, 68, 64, 60, 56, 52, 48, 44, 40, 36, 0,
+ 0},
+ {15520, 6710, 3864, 2160, 1463, 891, 642, 447, 374, 304, 252, 208, 192,
+ 175, 146, 0, 0},
+ {18030, 11090, 6989, 4867, 3744, 2466, 1788, 925, 624, 355, 248, 174,
+ 146, 112, 108, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultUseFilterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
{28147, 0, 0}, {26025, 0, 0}, {19998, 0, 0}, {26875, 0, 0},
{24902, 0, 0}, {20217, 0, 0}, {12539, 0, 0}, {22400, 0, 0},
@@ -228,25 +232,28 @@
{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
{16384, 0, 0}, {16384, 0, 0}};
-constexpr uint16_t kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
- 23819, 19992, 15557, 3210, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultFilterIntraModeCdf[kNumFilterIntraPredictors + 1] = {
+ 23819, 19992, 15557, 3210, 0, 0};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultTxDepthCdf[4][kTxDepthContexts][kMaxTxDepthSymbolCount + 1] = {
{{12800, 0, 0}, {12800, 0, 0}, {8448, 0, 0}},
{{20496, 2596, 0, 0}, {20496, 2596, 0, 0}, {14091, 1920, 0, 0}},
{{19782, 17588, 0, 0}, {19782, 17588, 0, 0}, {8466, 7166, 0, 0}},
{{26986, 21293, 0, 0}, {26986, 21293, 0, 0}, {15965, 10009, 0, 0}}};
-constexpr uint16_t kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] =
- {{4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0}, {14572, 0, 0},
- {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0}, {21763, 0, 0}, {5589, 0, 0},
- {12764, 0, 0}, {21487, 0, 0}, {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0},
- {4753, 0, 0}, {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0},
- {16680, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultTxSplitCdf[kTxSplitContexts][kBooleanFieldCdfSize] = {
+ {4187, 0, 0}, {8922, 0, 0}, {11921, 0, 0}, {8453, 0, 0},
+ {14572, 0, 0}, {20635, 0, 0}, {13977, 0, 0}, {21881, 0, 0},
+ {21763, 0, 0}, {5589, 0, 0}, {12764, 0, 0}, {21487, 0, 0},
+ {6219, 0, 0}, {13460, 0, 0}, {18544, 0, 0}, {4753, 0, 0},
+ {11222, 0, 0}, {18368, 0, 0}, {4603, 0, 0}, {10367, 0, 0},
+ {16680, 0, 0}};
/* clang-format off */
-constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultAllZeroCdf[kCoefficientQuantizerContexts]
[kNumSquareTransformSizes][kAllZeroContexts]
[kBooleanFieldCdfSize] = {
{
@@ -320,7 +327,7 @@
};
/* clang-format on */
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultInterTxTypeCdf[3][kNumExtendedTransformSizes][kNumTransformTypes +
1] = {
{{28310, 27208, 25073, 23059, 19438, 17979, 15231, 12502, 11264, 9920,
@@ -339,7 +346,7 @@
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{16384, 0, 0}, {28601, 0, 0}, {30770, 0, 0}, {32020, 0, 0}}};
-constexpr uint16_t kDefaultIntraTxTypeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultIntraTxTypeCdf
[2][kNumExtendedTransformSizes][kIntraPredictionModesY]
[kNumTransformTypes + 1] = {
{{{31233, 24733, 23307, 20017, 9301, 4943, 0, 0},
@@ -408,26 +415,26 @@
{32685, 27153, 20767, 15540, 0, 0},
{30800, 27212, 20745, 14221, 0, 0}}}};
-constexpr uint16_t kDefaultEobPt16Cdf[kCoefficientQuantizerContexts]
- [kNumPlaneTypes][kEobPtContexts]
- [kEobPt16SymbolCount + 1] = {
- {{{31928, 31729, 30788, 27873, 0, 0},
- {32398, 32097, 30885, 28297, 0, 0}},
- {{29521, 27818, 23080, 18205, 0, 0},
- {30864, 29414, 25005, 18121, 0, 0}}},
- {{{30643, 30217, 27603, 23822, 0, 0},
- {32255, 32003, 30909, 26429, 0, 0}},
- {{25131, 23270, 18509, 13660, 0, 0},
- {30271, 28672, 23902, 15775, 0, 0}}},
- {{{28752, 27871, 23887, 17800, 0, 0},
- {32052, 31663, 30122, 22712, 0, 0}},
- {{21629, 19498, 14527, 9202, 0, 0},
- {29576, 27736, 22471, 13013, 0, 0}}},
- {{{26060, 23810, 18022, 10635, 0, 0},
- {31546, 30694, 27985, 17358, 0, 0}},
- {{13193, 11002, 6724, 3059, 0, 0},
- {25471, 22001, 13495, 4574, 0, 0}}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultEobPt16Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+ [kEobPtContexts][kEobPt16SymbolCount + 1] = {
+ {{{31928, 31729, 30788, 27873, 0, 0},
+ {32398, 32097, 30885, 28297, 0, 0}},
+ {{29521, 27818, 23080, 18205, 0, 0},
+ {30864, 29414, 25005, 18121, 0, 0}}},
+ {{{30643, 30217, 27603, 23822, 0, 0},
+ {32255, 32003, 30909, 26429, 0, 0}},
+ {{25131, 23270, 18509, 13660, 0, 0},
+ {30271, 28672, 23902, 15775, 0, 0}}},
+ {{{28752, 27871, 23887, 17800, 0, 0},
+ {32052, 31663, 30122, 22712, 0, 0}},
+ {{21629, 19498, 14527, 9202, 0, 0},
+ {29576, 27736, 22471, 13013, 0, 0}}},
+ {{{26060, 23810, 18022, 10635, 0, 0},
+ {31546, 30694, 27985, 17358, 0, 0}},
+ {{13193, 11002, 6724, 3059, 0, 0},
+ {25471, 22001, 13495, 4574, 0, 0}}}};
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultEobPt32Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
[kEobPtContexts][kEobPt32SymbolCount + 1] = {
{{{32368, 32248, 31791, 30666, 26226, 0, 0},
@@ -446,7 +453,7 @@
{31612, 31066, 29093, 23494, 12229, 0, 0}},
{{10682, 8486, 5758, 2998, 1025, 0, 0},
{25069, 21871, 11877, 5842, 1140, 0, 0}}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultEobPt64Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
[kEobPtContexts][kEobPt64SymbolCount + 1] = {
{{{32439, 32270, 31667, 30984, 29503, 25010, 0, 0},
@@ -465,7 +472,7 @@
{31479, 30448, 28797, 24842, 18615, 8477, 0, 0}},
{{8556, 7060, 4500, 2733, 1461, 719, 0, 0},
{24042, 20390, 13359, 6318, 2730, 306, 0, 0}}}};
-constexpr uint16_t kDefaultEobPt128Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt128Cdf
[kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
[kEobPt128SymbolCount + 1] = {
{{{32549, 32286, 31628, 30677, 29088, 26740, 20182, 0, 0},
@@ -485,7 +492,7 @@
{{8455, 6706, 4383, 2661, 1551, 870, 423, 0, 0},
{23603, 19486, 11618, 2482, 874, 197, 56, 0, 0}}}};
-constexpr uint16_t kDefaultEobPt256Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt256Cdf
[kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPtContexts]
[kEobPt256SymbolCount + 1] = {
{{{32458, 32184, 30881, 29179, 26600, 24157, 21416, 17116, 0, 0},
@@ -505,7 +512,7 @@
{{9658, 8171, 5628, 3874, 2601, 1841, 1376, 674, 0, 0},
{22770, 15107, 7590, 4671, 1460, 730, 365, 73, 0, 0}}}};
-constexpr uint16_t kDefaultEobPt512Cdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobPt512Cdf
[kCoefficientQuantizerContexts][kNumPlaneTypes][kEobPt512SymbolCount + 1] =
{{{32127, 31785, 29061, 27338, 22534, 17810, 13980, 9356, 6707, 0, 0},
{27673, 26322, 22772, 19414, 16751, 14782, 11849, 6639, 3628, 0, 0}},
@@ -516,7 +523,7 @@
{{26841, 24959, 21845, 18171, 13329, 8633, 4312, 1626, 708, 0, 0},
{11675, 9725, 7026, 5110, 3671, 3052, 2695, 1948, 812, 0, 0}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultEobPt1024Cdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
[kEobPt1024SymbolCount + 1] = {
{{32375, 32347, 32017, 31145, 29608, 26416, 19423,
@@ -537,7 +544,7 @@
2961, 198, 0, 0}}};
/* clang-format off */
-constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultEobExtraCdf[kCoefficientQuantizerContexts]
[kNumSquareTransformSizes][kNumPlaneTypes]
[kEobExtraContexts][kBooleanFieldCdfSize] = {
{
@@ -710,7 +717,7 @@
}
};
-constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseEobCdf[kCoefficientQuantizerContexts]
[kNumSquareTransformSizes][kNumPlaneTypes]
[kCoeffBaseEobContexts]
[kCoeffBaseEobSymbolCount + 1] = {
@@ -845,7 +852,7 @@
};
/* clang-format on */
-constexpr uint16_t kDefaultCoeffBaseCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseCdf
[kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1] = {
{{{{28734, 23838, 20041, 0, 0}, {14686, 3027, 891, 0, 0},
@@ -1689,7 +1696,7 @@
{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0},
{24576, 16384, 8192, 0, 0}, {24576, 16384, 8192, 0, 0}}}}};
-constexpr uint16_t kDefaultCoeffBaseRangeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCoeffBaseRangeCdf
[kCoefficientQuantizerContexts][kNumSquareTransformSizes][kNumPlaneTypes]
[kCoeffBaseRangeContexts][kCoeffBaseRangeSymbolCount + 1] = {
{{{{18470, 12050, 8594, 0, 0}, {20232, 13167, 8979, 0, 0},
@@ -2134,7 +2141,7 @@
{24576, 16384, 8192, 0, 0}}}}};
/* clang-format off */
-constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultDcSignCdf[kCoefficientQuantizerContexts][kNumPlaneTypes]
[kDcSignContexts][kBooleanFieldCdfSize] = {
{{{16768, 0, 0}, {19712, 0, 0}, {13952, 0, 0}}, {{17536, 0, 0}, {19840, 0, 0},
{15488, 0, 0}}},
@@ -2146,14 +2153,17 @@
{15488, 0, 0}}}
};
/* clang-format on */
-constexpr uint16_t kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] =
- {23355, 10187, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultRestorationTypeCdf[kRestorationTypeSymbolCount + 1] = {23355, 10187,
+ 0, 0};
-constexpr uint16_t kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseWienerCdf[kBooleanFieldCdfSize] = {21198, 0, 0};
-constexpr uint16_t kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseSgrProjCdf[kBooleanFieldCdfSize] = {15913, 0, 0};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultHasPaletteYCdf[kPaletteBlockSizeContexts][kPaletteYModeContexts]
[kBooleanFieldCdfSize] = {
{{1092, 0, 0}, {29349, 0, 0}, {31507, 0, 0}},
@@ -2164,7 +2174,7 @@
{{503, 0, 0}, {28753, 0, 0}, {31247, 0, 0}},
{{318, 0, 0}, {24822, 0, 0}, {32639, 0, 0}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultPaletteYSizeCdf[kPaletteBlockSizeContexts]
[kPaletteSizeSymbolCount + 1] = {
{24816, 19768, 14619, 11290, 7241, 3527, 0, 0},
@@ -2175,11 +2185,11 @@
{23057, 17880, 15845, 11716, 7107, 4893, 0, 0},
{17828, 11971, 11090, 8582, 5735, 3769, 0, 0}};
-constexpr uint16_t kDefaultHasPaletteUVCdf[kPaletteUVModeContexts]
- [kBooleanFieldCdfSize] = {
- {307, 0, 0}, {11280, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultHasPaletteUVCdf[kPaletteUVModeContexts][kBooleanFieldCdfSize] = {
+ {307, 0, 0}, {11280, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultPaletteUVSizeCdf[kPaletteBlockSizeContexts]
[kPaletteSizeSymbolCount + 1] = {
{24055, 12789, 5640, 3159, 1437, 496, 0, 0},
@@ -2191,7 +2201,7 @@
{31499, 27333, 22335, 13805, 11068, 6903, 0,
0}};
-constexpr uint16_t kDefaultPaletteColorIndexCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultPaletteColorIndexCdf
[kNumPlaneTypes][kPaletteSizeSymbolCount][kPaletteColorIndexContexts]
[kPaletteColorIndexSymbolCount + 1] = {
{{{4058, 0, 0},
@@ -2265,10 +2275,11 @@
{14803, 12684, 10536, 8794, 6494, 4366, 2378, 0, 0},
{1578, 1439, 1252, 1089, 943, 742, 446, 0, 0}}}};
-constexpr uint16_t kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] =
- {{31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultIsInterCdf[kIsInterContexts][kBooleanFieldCdfSize] = {
+ {31962, 0, 0}, {16106, 0, 0}, {12582, 0, 0}, {6230, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultUseCompoundReferenceCdf[kUseCompoundReferenceContexts]
[kBooleanFieldCdfSize] = {{5940, 0, 0},
{8733, 0, 0},
@@ -2276,7 +2287,7 @@
{22128, 0, 0},
{29867, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultCompoundReferenceTypeCdf[kCompoundReferenceTypeContexts]
[kBooleanFieldCdfSize] = {{31570, 0, 0},
{30698, 0, 0},
@@ -2284,7 +2295,7 @@
{25269, 0, 0},
{10293, 0, 0}};
-constexpr uint16_t kDefaultCompoundReferenceCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundReferenceCdf
[kNumCompoundReferenceTypes][kReferenceContexts][3][kBooleanFieldCdfSize] =
{{{{27484, 0, 0}, {28903, 0, 0}, {29640, 0, 0}},
{{9616, 0, 0}, {18595, 0, 0}, {17498, 0, 0}},
@@ -2293,7 +2304,7 @@
{{12877, 0, 0}, {10327, 0, 0}, {17608, 0, 0}},
{{2037, 0, 0}, {1709, 0, 0}, {5224, 0, 0}}}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultCompoundBackwardReferenceCdf[kReferenceContexts][2]
[kBooleanFieldCdfSize] = {
{{30533, 0, 0}, {31345, 0, 0}},
@@ -2301,7 +2312,7 @@
{{2162, 0, 0}, {2279, 0, 0}}};
/* clang-format off */
-constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
+alignas(kMaxAlignment) constexpr uint16_t kDefaultSingleReferenceCdf[kReferenceContexts][6]
[kBooleanFieldCdfSize] = {
{{27871, 0, 0}, {31213, 0, 0}, {28532, 0, 0}, {24118, 0, 0}, {31864, 0, 0},
{31324, 0, 0}},
@@ -2311,7 +2322,7 @@
{2464, 0, 0}}};
/* clang-format on */
-constexpr uint16_t kDefaultCompoundPredictionModeCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultCompoundPredictionModeCdf
[kCompoundPredictionModeContexts][kNumCompoundInterPredictionModes + 1] = {
{25008, 18945, 16960, 15127, 13612, 12102, 5877, 0, 0},
{22038, 13316, 11623, 10019, 8729, 7637, 4044, 0, 0},
@@ -2322,35 +2333,37 @@
{15643, 8495, 6954, 5276, 4554, 4064, 2176, 0, 0},
{19722, 9554, 8263, 6826, 5333, 4326, 3438, 0, 0}};
-constexpr uint16_t kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
- {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0},
- {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultNewMvCdf[kNewMvContexts][kBooleanFieldCdfSize] = {
+ {8733, 0, 0}, {16138, 0, 0}, {17429, 0, 0},
+ {24382, 0, 0}, {20546, 0, 0}, {28092, 0, 0}};
-constexpr uint16_t kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {
- {30593, 0, 0}, {31714, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultZeroMvCdf[kZeroMvContexts][kBooleanFieldCdfSize] = {{30593, 0, 0},
+ {31714, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultReferenceMvCdf[kReferenceMvContexts][kBooleanFieldCdfSize] = {
{8794, 0, 0}, {8580, 0, 0}, {14920, 0, 0},
{4146, 0, 0}, {8456, 0, 0}, {12845, 0, 0}};
// This is called drl_mode in the spec where DRL stands for Dynamic Reference
// List.
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultRefMvIndexCdf[kRefMvIndexContexts][kBooleanFieldCdfSize] = {
{19664, 0, 0}, {8208, 0, 0}, {13823, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultIsInterIntraCdf[kInterIntraContexts][kBooleanFieldCdfSize] = {
{5881, 0, 0}, {5171, 0, 0}, {2531, 0, 0}};
-constexpr uint16_t kDefaultInterIntraModeCdf[kInterIntraContexts]
- [kNumInterIntraModes + 1] = {
- {30893, 21686, 5436, 0, 0},
- {30295, 22772, 6380, 0, 0},
- {28530, 21231, 6842, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultInterIntraModeCdf[kInterIntraContexts][kNumInterIntraModes + 1] = {
+ {30893, 21686, 5436, 0, 0},
+ {30295, 22772, 6380, 0, 0},
+ {28530, 21231, 6842, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultIsWedgeInterIntraCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
{12732, 0, 0}, {7811, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
@@ -2359,7 +2372,7 @@
{16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
{16384, 0, 0}, {16384, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultWedgeIndexCdf[kMaxBlockSizes][kWedgeIndexSymbolCount + 1] = {
{30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
10240, 8192, 6144, 4096, 2048, 0, 0},
@@ -2406,47 +2419,53 @@
{30720, 28672, 26624, 24576, 22528, 20480, 18432, 16384, 14336, 12288,
10240, 8192, 6144, 4096, 2048, 0, 0}};
-constexpr uint16_t kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
- {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {22331, 0, 0},
- {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0}, {23467, 0, 0}, {15336, 0, 0},
- {18345, 0, 0}, {8760, 0, 0}, {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0},
- {9945, 0, 0}, {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0},
- {1208, 0, 0}, {130, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultUseObmcCdf[kMaxBlockSizes][kBooleanFieldCdfSize] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {22331, 0, 0}, {23397, 0, 0}, {9104, 0, 0}, {16384, 0, 0},
+ {23467, 0, 0}, {15336, 0, 0}, {18345, 0, 0}, {8760, 0, 0},
+ {11867, 0, 0}, {17626, 0, 0}, {6951, 0, 0}, {9945, 0, 0},
+ {5889, 0, 0}, {10685, 0, 0}, {2640, 0, 0}, {1754, 0, 0},
+ {1208, 0, 0}, {130, 0, 0}};
-constexpr uint16_t kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] =
- {{21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
- {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0},
- {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
- {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0},
- {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0},
- {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0},
- {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0},
- {261, 210, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMotionModeCdf[kMaxBlockSizes][kNumMotionModes + 1] = {
+ {21845, 10923, 0, 0}, {21845, 10923, 0, 0}, {21845, 10923, 0, 0},
+ {21845, 10923, 0, 0}, {25117, 8008, 0, 0}, {28030, 8003, 0, 0},
+ {3969, 1378, 0, 0}, {21845, 10923, 0, 0}, {27377, 7240, 0, 0},
+ {13349, 5958, 0, 0}, {27645, 9162, 0, 0}, {3795, 1174, 0, 0},
+ {6337, 1994, 0, 0}, {21162, 8460, 0, 0}, {6508, 3652, 0, 0},
+ {12408, 4706, 0, 0}, {3026, 1565, 0, 0}, {11089, 5938, 0, 0},
+ {3252, 2067, 0, 0}, {3870, 2371, 0, 0}, {1890, 1433, 0, 0},
+ {261, 210, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultIsExplicitCompoundTypeCdf[kIsExplicitCompoundTypeContexts]
[kBooleanFieldCdfSize] = {
{6161, 0, 0}, {9877, 0, 0},
{13928, 0, 0}, {8174, 0, 0},
{12834, 0, 0}, {10094, 0, 0}};
-constexpr uint16_t
+alignas(kMaxAlignment) constexpr uint16_t
kDefaultIsCompoundTypeAverageCdf[kIsCompoundTypeAverageContexts]
[kBooleanFieldCdfSize] = {
{14524, 0, 0}, {19903, 0, 0},
{25715, 0, 0}, {19509, 0, 0},
{23434, 0, 0}, {28124, 0, 0}};
-constexpr uint16_t kDefaultCompoundTypeCdf
- [kMaxBlockSizes][kNumExplicitCompoundPredictionTypes + 1] = {
- {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
- {9337, 0, 0}, {19597, 0, 0}, {20948, 0, 0}, {16384, 0, 0},
- {21298, 0, 0}, {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
- {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0}, {16384, 0, 0},
- {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
- {16384, 0, 0}, {16384, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultCompoundTypeCdf[kMaxBlockSizes]
+ [kNumExplicitCompoundPredictionTypes + 1] = {
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {9337, 0, 0}, {19597, 0, 0},
+ {20948, 0, 0}, {16384, 0, 0}, {21298, 0, 0},
+ {22998, 0, 0}, {23668, 0, 0}, {16384, 0, 0},
+ {25067, 0, 0}, {24535, 0, 0}, {26596, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}, {16384, 0, 0}, {16384, 0, 0},
+ {16384, 0, 0}};
-constexpr uint16_t kDefaultInterpolationFilterCdf
+alignas(kMaxAlignment) constexpr uint16_t kDefaultInterpolationFilterCdf
[kInterpolationFilterContexts][kNumExplicitInterpolationFilters + 1] = {
{833, 48, 0, 0}, {27200, 49, 0, 0}, {32346, 29830, 0, 0},
{4524, 160, 0, 0}, {1562, 815, 0, 0}, {27906, 647, 0, 0},
@@ -2455,30 +2474,36 @@
{1746, 759, 0, 0}, {29805, 675, 0, 0}, {32167, 31825, 0, 0},
{17799, 11370, 0, 0}};
-constexpr uint16_t kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504,
- 13440, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvJointCdf[kNumMvJointTypes + 1] = {28672, 21504, 13440, 0, 0};
-constexpr uint16_t kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvSignCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
-constexpr uint16_t kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
- 4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClassCdf[kMvClassSymbolCount + 1] = {
+ 4096, 1792, 910, 448, 217, 112, 28, 11, 6, 1, 0};
-constexpr uint16_t kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0BitCdf[kBooleanFieldCdfSize] = {5120, 0, 0};
-constexpr uint16_t kDefaultMvClass0FractionCdf[kBooleanSymbolCount]
- [kMvFractionSymbolCount + 1] = {
- {16384, 8192, 6144, 0, 0},
- {20480, 11520, 8640, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0FractionCdf[kBooleanSymbolCount][kMvFractionSymbolCount +
+ 1] = {
+ {16384, 8192, 6144, 0, 0}, {20480, 11520, 8640, 0, 0}};
-constexpr uint16_t kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {
- 12288, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvClass0HighPrecisionCdf[kBooleanFieldCdfSize] = {12288, 0, 0};
-constexpr uint16_t kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
- {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0}, {10240, 0, 0},
- {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0}, {2816, 0, 0}, {2048, 0, 0}};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvBitCdf[kMvBitSymbolCount][kBooleanFieldCdfSize] = {
+ {15360, 0, 0}, {14848, 0, 0}, {13824, 0, 0}, {12288, 0, 0},
+ {10240, 0, 0}, {8192, 0, 0}, {4096, 0, 0}, {2816, 0, 0},
+ {2816, 0, 0}, {2048, 0, 0}};
-constexpr uint16_t kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {
- 24576, 15360, 11520, 0, 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvFractionCdf[kMvFractionSymbolCount + 1] = {24576, 15360, 11520, 0,
+ 0};
-constexpr uint16_t kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0,
- 0};
+alignas(kMaxAlignment) constexpr uint16_t
+ kDefaultMvHighPrecisionCdf[kBooleanFieldCdfSize] = {16384, 0, 0};
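
A note on the alignas(kMaxAlignment) additions running through this file: aligning the default CDF tables lets them act as aligned copy sources when a fresh symbol decoder context is initialized. A minimal sketch of the pattern, with stand-in names and an assumed alignment value (kMaxAlignment, ContextSketch, and ResetCdf below are illustrative, not the library's API):

#include <cstdint>
#include <cstring>

constexpr int kMaxAlignment = 16;  // assumption: the library's SIMD alignment
constexpr int kBooleanFieldCdfSize = 3;

alignas(kMaxAlignment) constexpr uint16_t kDefaultCdf[kBooleanFieldCdfSize] = {
    16384, 0, 0};

struct ContextSketch {
  alignas(kMaxAlignment) uint16_t cdf[kBooleanFieldCdfSize];
};

// With source and destination both aligned, the compiler can lower this copy
// to aligned vector loads/stores instead of byte-wise moves.
void ResetCdf(ContextSketch* context) {
  memcpy(context->cdf, kDefaultCdf, sizeof(kDefaultCdf));
}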
diff --git a/libgav1/src/threading_strategy.cc b/libgav1/src/threading_strategy.cc
index 2864c34..17ce18f 100644
--- a/libgav1/src/threading_strategy.cc
+++ b/libgav1/src/threading_strategy.cc
@@ -27,7 +27,7 @@
namespace {
#if !defined(LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER)
-constexpr int kFrameParallelThresholdMultiplier = 4;
+constexpr int kFrameParallelThresholdMultiplier = 3;
#else
constexpr int kFrameParallelThresholdMultiplier =
LIBGAV1_FRAME_PARALLEL_THRESHOLD_MULTIPLIER;
@@ -36,24 +36,25 @@
// Computes the number of frame threads to be used based on the following
// heuristic:
// * If |thread_count| == 1, return 0.
-// * If |thread_count| <= |tile_count| * 4, return 0.
+// * If |thread_count| <= |tile_count| * kFrameParallelThresholdMultiplier,
+// return 0.
// * Otherwise, return the largest value of i which satisfies the following
// condition: i + i * tile_columns <= thread_count. This ensures that there
// are at least |tile_columns| worker threads for each frame thread.
// * This function will never return 1 or a value > |thread_count|.
//
-// This heuristic is based empirical performance data. The in-frame threading
-// model (combination of tile multithreading, superblock row multithreading and
-// post filter multithreading) performs better than the frame parallel model
-// until we reach the threshold of |thread_count| > |tile_count| *
-// kFrameParallelThresholdMultiplier.
+// This heuristic is based on empirical performance data. The in-frame
+// threading model (combination of tile multithreading, superblock row
+// multithreading and post filter multithreading) performs better than the
+// frame parallel model until we reach the threshold of |thread_count| >
+// |tile_count| * kFrameParallelThresholdMultiplier.
//
// It is a function of |tile_count| since tile threading and superblock row
-// multithreading will scale only as a factor of |tile_count|. The threshold 4
-// is arrived at based on empirical data. The general idea is that superblock
-// row multithreading plateaus at 4 * |tile_count| because in most practical
-// cases there aren't more than that many superblock rows and columns available
-// to work on in parallel.
+// multithreading will scale only as a factor of |tile_count|. The threshold
+// kFrameParallelThresholdMultiplier is arrived at based on empirical data.
+// The general idea is that superblock row multithreading plateaus at 4 *
+// |tile_count| because in most practical cases there aren't more than that
+// many superblock rows and columns available to work on in parallel.
int ComputeFrameThreadCount(int thread_count, int tile_count,
int tile_columns) {
assert(thread_count > 0);
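
For reference, the heuristic documented above condenses to the following standalone sketch; it mirrors the commented contract (return 0 below the threshold, otherwise the largest i with i + i * tile_columns <= thread_count, never returning 1), not necessarily the exact body that follows in the source:

#include <cassert>

constexpr int kFrameParallelThresholdMultiplier = 3;

int ComputeFrameThreadCountSketch(int thread_count, int tile_count,
                                  int tile_columns) {
  assert(thread_count > 0);
  if (thread_count == 1) return 0;
  if (thread_count <= tile_count * kFrameParallelThresholdMultiplier) return 0;
  // Largest i with i * (1 + tile_columns) <= thread_count: each frame thread
  // keeps |tile_columns| worker threads for itself.
  const int i = thread_count / (1 + tile_columns);
  return (i <= 1) ? 0 : i;  // the contract forbids returning 1
}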
@@ -132,7 +133,7 @@
thread_count -= 2;
if (thread_count <= 0) break;
}
-#else // !defined(__ANDROID__)
+#else   // !defined(__ANDROID__)
// Assign the remaining threads to each Tile.
for (int i = 0; i < tile_count; ++i) {
const int count = thread_count / tile_count +
diff --git a/libgav1/src/tile.h b/libgav1/src/tile.h
index 065ef70..6bae2a0 100644
--- a/libgav1/src/tile.h
+++ b/libgav1/src/tile.h
@@ -48,7 +48,6 @@
#include "src/utils/constants.h"
#include "src/utils/entropy_decoder.h"
#include "src/utils/memory.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/segmentation_map.h"
#include "src/utils/threadpool.h"
#include "src/utils/types.h"
@@ -74,6 +73,7 @@
const ObuFrameHeader& frame_header, RefCountedBuffer* const current_frame,
const DecoderState& state, FrameScratchBuffer* const frame_scratch_buffer,
const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
SymbolDecoderContext* const saved_symbol_decoder_context,
const SegmentationMap* prev_segment_ids, PostFilter* const post_filter,
const dsp::Dsp* const dsp, ThreadPool* const thread_pool,
@@ -81,9 +81,10 @@
bool use_intra_prediction_buffer) {
std::unique_ptr<Tile> tile(new (std::nothrow) Tile(
tile_number, data, size, sequence_header, frame_header, current_frame,
- state, frame_scratch_buffer, wedge_masks, saved_symbol_decoder_context,
- prev_segment_ids, post_filter, dsp, thread_pool, pending_tiles,
- frame_parallel, use_intra_prediction_buffer));
+ state, frame_scratch_buffer, wedge_masks, quantizer_matrix,
+ saved_symbol_decoder_context, prev_segment_ids, post_filter, dsp,
+ thread_pool, pending_tiles, frame_parallel,
+ use_intra_prediction_buffer));
return (tile != nullptr && tile->Init()) ? std::move(tile) : nullptr;
}
@@ -186,17 +187,6 @@
int column4x4_end() const { return column4x4_end_; }
private:
- Tile(int tile_number, const uint8_t* data, size_t size,
- const ObuSequenceHeader& sequence_header,
- const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
- const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
- const WedgeMaskArray& wedge_masks,
- SymbolDecoderContext* saved_symbol_decoder_context,
- const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
- const dsp::Dsp* dsp, ThreadPool* thread_pool,
- BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
- bool use_intra_prediction_buffer);
-
// Stores the transform tree state when reading variable size transform trees
// and when applying the transform tree. When applying the transform tree,
// |depth| is not used.
@@ -248,6 +238,18 @@
// every transform block.
using ResidualPtr = uint8_t*;
+ Tile(int tile_number, const uint8_t* data, size_t size,
+ const ObuSequenceHeader& sequence_header,
+ const ObuFrameHeader& frame_header, RefCountedBuffer* current_frame,
+ const DecoderState& state, FrameScratchBuffer* frame_scratch_buffer,
+ const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
+ SymbolDecoderContext* saved_symbol_decoder_context,
+ const SegmentationMap* prev_segment_ids, PostFilter* post_filter,
+ const dsp::Dsp* dsp, ThreadPool* thread_pool,
+ BlockingCounterWithStatus* pending_tiles, bool frame_parallel,
+ bool use_intra_prediction_buffer);
+
// Performs member initializations that may fail. Helper function used by
// Create().
LIBGAV1_MUST_USE_RESULT bool Init();
@@ -289,26 +291,25 @@
// iteratively. It performs a DFS traversal over the partition tree to process
// the blocks in the right order.
bool ProcessPartition(
- int row4x4_start, int column4x4_start, ParameterTree* root,
- TileScratchBuffer* scratch_buffer,
+ int row4x4_start, int column4x4_start, TileScratchBuffer* scratch_buffer,
ResidualPtr* residual); // Iterative implementation of 5.11.4.
bool ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
- ParameterTree* tree, TileScratchBuffer* scratch_buffer,
+ TileScratchBuffer* scratch_buffer,
ResidualPtr* residual); // 5.11.5.
void ResetCdef(int row4x4, int column4x4); // 5.11.55.
// This function is used to decode a superblock when the parsing has already
// been done for that superblock.
- bool DecodeSuperBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
- ResidualPtr* residual);
+ bool DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* scratch_buffer);
// Helper function used by DecodeSuperBlock(). Note that the decode_block()
// function in the spec is equivalent to ProcessBlock() in the code.
- bool DecodeBlock(ParameterTree* tree, TileScratchBuffer* scratch_buffer,
- ResidualPtr* residual);
+ bool DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
+ TileScratchBuffer* scratch_buffer, ResidualPtr* residual);
void ClearBlockDecoded(TileScratchBuffer* scratch_buffer, int row4x4,
int column4x4); // 5.11.3.
- bool ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+ bool ProcessSuperBlock(int row4x4, int column4x4,
TileScratchBuffer* scratch_buffer,
ProcessingMode mode);
void ResetLoopRestorationParams();
@@ -357,7 +358,12 @@
const MvContexts& mode_contexts);
void ReadRefMvIndex(const Block& block);
void ReadInterIntraMode(const Block& block, bool is_compound); // 5.11.28.
- bool IsScaled(ReferenceFrameType type) const; // Part of 5.11.27.
+ bool IsScaled(ReferenceFrameType type) const { // Part of 5.11.27.
+ const int index =
+ frame_header_.reference_frame_index[type - kReferenceFrameLast];
+ return reference_frames_[index]->upscaled_width() != frame_header_.width ||
+ reference_frames_[index]->frame_height() != frame_header_.height;
+ }
void ReadMotionMode(const Block& block, bool is_compound); // 5.11.27.
uint16_t* GetIsExplicitCompoundTypeCdf(const Block& block);
uint16_t* GetIsCompoundTypeAverageCdf(const Block& block);
@@ -394,22 +400,28 @@
TransformSize tx_size); // 5.11.47.
template <typename ResidualType>
void ReadCoeffBase2D(
- const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* quantized_buffer);
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
template <typename ResidualType>
void ReadCoeffBaseHorizontal(
- const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* quantized_buffer);
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
template <typename ResidualType>
void ReadCoeffBaseVertical(
- const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* quantized_buffer);
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer, uint8_t* level_buffer);
int GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane);
void SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
uint8_t coefficient_level, int8_t dc_category);
@@ -495,9 +507,8 @@
const uint16_t* scan, int i, int q_value, const uint8_t* quantizer_matrix,
int shift, int max_value, uint16_t* dc_sign_cdf, int8_t* dc_category,
int* coefficient_level,
- ResidualType* residual_buffer); // Part of 5.11.39.
- int ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context,
- int plane_type); // Part of 5.11.39.
+ ResidualType* residual_buffer); // Part of 5.11.39.
+ int ReadCoeffBaseRange(uint16_t* cdf); // Part of 5.11.39.
// Returns the number of non-zero coefficients that were read. |tx_type| is an
// output parameter that stores the computed transform type for the plane
// whose coefficients were read. Returns -1 on failure.
@@ -637,6 +648,7 @@
TemporalMotionField& motion_field_;
const std::array<uint8_t, kNumReferenceFrameTypes>& reference_order_hint_;
const WedgeMaskArray& wedge_masks_;
+ const QuantizerMatrix& quantizer_matrix_;
DaalaBitReader reader_;
SymbolDecoderContext symbol_decoder_context_;
SymbolDecoderContext* const saved_symbol_decoder_context_;
diff --git a/libgav1/src/tile/bitstream/mode_info.cc b/libgav1/src/tile/bitstream/mode_info.cc
index d73ebed..0b22eb0 100644
--- a/libgav1/src/tile/bitstream/mode_info.cc
+++ b/libgav1/src/tile/bitstream/mode_info.cc
@@ -44,7 +44,6 @@
constexpr int kDeltaQSmall = 3;
constexpr int kDeltaLfSmall = 3;
-constexpr int kNoScale = 1 << kReferenceFrameScalePrecision;
constexpr uint8_t kIntraYModeContext[kIntraPredictionModesY] = {
0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0};
@@ -510,9 +509,9 @@
BlockParameters& bp = *block.bp;
const int context =
static_cast<int>(block.bp->prediction_parameters->use_intra_block_copy);
- const auto mv_joint = static_cast<MvJointType>(
- reader_.ReadSymbol(symbol_decoder_context_.mv_joint_cdf[context],
- static_cast<int>(kNumMvJointTypes)));
+ const auto mv_joint =
+ static_cast<MvJointType>(reader_.ReadSymbol<kNumMvJointTypes>(
+ symbol_decoder_context_.mv_joint_cdf[context]));
if (mv_joint == kMvJointTypeHorizontalZeroVerticalNonZero ||
mv_joint == kMvJointTypeNonZero) {
bp.mv.mv[index].mv[0] = ReadMotionVectorComponent(block, 0);
@@ -1032,21 +1031,6 @@
prediction_parameters.wedge_sign = 0;
}
-bool Tile::IsScaled(ReferenceFrameType type) const {
- const int index =
- frame_header_.reference_frame_index[type - kReferenceFrameLast];
- const int x_scale = ((reference_frames_[index]->upscaled_width()
- << kReferenceFrameScalePrecision) +
- DivideBy2(frame_header_.width)) /
- frame_header_.width;
- if (x_scale != kNoScale) return true;
- const int y_scale = ((reference_frames_[index]->frame_height()
- << kReferenceFrameScalePrecision) +
- DivideBy2(frame_header_.height)) /
- frame_header_.height;
- return y_scale != kNoScale;
-}
-
void Tile::ReadMotionMode(const Block& block, bool is_compound) {
BlockParameters& bp = *block.bp;
PredictionParameters& prediction_parameters =
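
The fixed-point test removed here and the direct comparison now inlined in tile.h agree for ordinary frame sizes: the rounded 14-bit ratio lands on kNoScale exactly when the dimensions match, and comparing dimensions avoids the shift and divide. A small check of both forms, as a sketch:

#include <cassert>

constexpr int kReferenceFrameScalePrecision = 14;
constexpr int kNoScale = 1 << kReferenceFrameScalePrecision;

bool IsScaledFixedPoint(int ref_width, int width) {
  const int x_scale =
      ((ref_width << kReferenceFrameScalePrecision) + width / 2) / width;
  return x_scale != kNoScale;
}

int main() {
  assert(!IsScaledFixedPoint(1920, 1920));  // equal sizes -> exactly kNoScale
  assert(IsScaledFixedPoint(960, 1920));    // half-width reference -> scaled
  assert(IsScaledFixedPoint(960, 1920) == (960 != 1920));
  return 0;
}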
diff --git a/libgav1/src/tile/bitstream/palette.cc b/libgav1/src/tile/bitstream/palette.cc
index 674d210..41b42d6 100644
--- a/libgav1/src/tile/bitstream/palette.cc
+++ b/libgav1/src/tile/bitstream/palette.cc
@@ -130,10 +130,10 @@
void Tile::ReadPaletteModeInfo(const Block& block) {
BlockParameters& bp = *block.bp;
+ bp.palette_mode_info.size[kPlaneTypeY] = 0;
+ bp.palette_mode_info.size[kPlaneTypeUV] = 0;
if (IsBlockSmallerThan8x8(block.size) || block.size > kBlock64x64 ||
!frame_header_.allow_screen_content_tools) {
- bp.palette_mode_info.size[kPlaneTypeY] = 0;
- bp.palette_mode_info.size[kPlaneTypeUV] = 0;
return;
}
const int block_size_context =
@@ -156,7 +156,7 @@
ReadPaletteColors(block, kPlaneY);
}
}
- if (bp.uv_mode == kPredictionModeDc && block.HasChroma()) {
+ if (block.HasChroma() && bp.uv_mode == kPredictionModeDc) {
const int context =
static_cast<int>(bp.palette_mode_info.size[kPlaneTypeY] > 0);
const bool has_palette_uv =
diff --git a/libgav1/src/tile/bitstream/partition.cc b/libgav1/src/tile/bitstream/partition.cc
index 60899a2..f3dbbb0 100644
--- a/libgav1/src/tile/bitstream/partition.cc
+++ b/libgav1/src/tile/bitstream/partition.cc
@@ -132,13 +132,13 @@
reader_.ReadSymbol<kMaxPartitionTypes>(partition_cdf));
}
} else if (has_columns) {
- uint16_t cdf[3] = {
- PartitionCdfGatherVerticalAlike(partition_cdf, block_size), 0, 0};
+ const uint16_t cdf =
+ PartitionCdfGatherVerticalAlike(partition_cdf, block_size);
*partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
: kPartitionHorizontal;
} else {
- uint16_t cdf[3] = {
- PartitionCdfGatherHorizontalAlike(partition_cdf, block_size), 0, 0};
+ const uint16_t cdf =
+ PartitionCdfGatherHorizontalAlike(partition_cdf, block_size);
*partition = reader_.ReadSymbolWithoutCdfUpdate(cdf) ? kPartitionSplit
: kPartitionVertical;
}
diff --git a/libgav1/src/tile/bitstream/transform_size.cc b/libgav1/src/tile/bitstream/transform_size.cc
index c5ee757..b79851d 100644
--- a/libgav1/src/tile/bitstream/transform_size.cc
+++ b/libgav1/src/tile/bitstream/transform_size.cc
@@ -117,9 +117,11 @@
const auto context = static_cast<int>(top_width >= max_tx_width) +
static_cast<int>(left_height >= max_tx_height);
const int cdf_index = kTxDepthCdfIndex[block.size];
- const int symbol_count = 3 - static_cast<int>(cdf_index == 0);
- const int tx_depth = reader_.ReadSymbol(
- symbol_decoder_context_.tx_depth_cdf[cdf_index][context], symbol_count);
+ uint16_t* const cdf =
+ symbol_decoder_context_.tx_depth_cdf[cdf_index][context];
+ const int tx_depth = (cdf_index == 0)
+ ? static_cast<int>(reader_.ReadSymbol(cdf))
+ : reader_.ReadSymbol<3>(cdf);
assert(tx_depth < 3);
TransformSize tx_size = max_rect_tx_size;
if (tx_depth == 0) return tx_size;
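
This hunk, the mv_joint read in mode_info.cc above, and the tx_type and eob_pt reads in tile.cc below all move the symbol count from a runtime argument into ReadSymbol's template parameter. A sketch of the payoff; the cdf walk here is a stand-in, not DaalaBitReader's actual arithmetic:

#include <cstdint>

// With the symbol count as a template parameter (ReadSymbol<3>(cdf)) instead
// of a runtime argument (ReadSymbol(cdf, 3)), the search loop has a
// compile-time trip count and can be unrolled per call site.
template <int symbol_count>
int ReadSymbolSketch(const uint16_t* cdf, uint16_t decoder_state) {
  int symbol = 0;
  // Trip count is a constant at each instantiation; compilers typically
  // unroll this completely.
  while (symbol < symbol_count - 1 && decoder_state < cdf[symbol]) ++symbol;
  return symbol;
}

int Demo(const uint16_t* cdf, uint16_t state) {
  return ReadSymbolSketch<3>(cdf, state);  // e.g. the 3-symbol tx_depth read
}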
diff --git a/libgav1/src/tile/prediction.cc b/libgav1/src/tile/prediction.cc
index a234a19..c5560a6 100644
--- a/libgav1/src/tile/prediction.cc
+++ b/libgav1/src/tile/prediction.cc
@@ -45,6 +45,8 @@
// Import all the constants in the anonymous namespace.
#include "src/inter_intra_masks.inc"
+// Precision bits when scaling reference frames.
+constexpr int kReferenceScaleShift = 14;
constexpr int kAngleStep = 3;
constexpr int kPredictionModeToAngle[kIntraPredictionModesUV] = {
0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0};
@@ -404,20 +406,13 @@
const int subsampling_x = subsampling_x_[plane];
const int subsampling_y = subsampling_y_[plane];
if (block.top_available[plane]) {
- const int row =
- block.row4x4 - 1 -
- static_cast<int>(subsampling_y != 0 && (block.row4x4 & 1) != 0);
- const int column =
- block.column4x4 +
- static_cast<int>(subsampling_x != 0 && (block.column4x4 & 1) == 0);
+ const int row = block.row4x4 - 1 - (block.row4x4 & subsampling_y);
+ const int column = block.column4x4 + (~block.column4x4 & subsampling_x);
if (IsSmoothPrediction(row, column, plane)) return 1;
}
if (block.left_available[plane]) {
- const int row = block.row4x4 + static_cast<int>(subsampling_y != 0 &&
- (block.row4x4 & 1) == 0);
- const int column =
- block.column4x4 - 1 -
- static_cast<int>(subsampling_x != 0 && (block.column4x4 & 1) != 0);
+ const int row = block.row4x4 + (~block.row4x4 & subsampling_y);
+ const int column = block.column4x4 - 1 - (block.column4x4 & subsampling_x);
if (IsSmoothPrediction(row, column, plane)) return 1;
}
return 0;
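
The branchless forms above rely on subsampling_x and subsampling_y being 0 or 1. An exhaustive check that they match the removed boolean expressions:

#include <cassert>

int main() {
  for (int coord = 0; coord < 4; ++coord) {
    for (int ss = 0; ss <= 1; ++ss) {
      const int old_odd = static_cast<int>(ss != 0 && (coord & 1) != 0);
      const int old_even = static_cast<int>(ss != 0 && (coord & 1) == 0);
      assert((coord & ss) == old_odd);    // used in the "- 1 -" adjustments
      assert((~coord & ss) == old_even);  // used in the "+" adjustments
    }
  }
  return 0;
}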
@@ -945,6 +940,68 @@
width, height, dest, dest_stride);
}
+void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
+ const int reference_frame_index, const int x,
+ const int y, int* const start_x,
+ int* const start_y, int* const step_x,
+ int* const step_y) {
+ const int reference_upscaled_width =
+ (reference_frame_index == -1)
+ ? frame_header_.upscaled_width
+ : reference_frames_[reference_frame_index]->upscaled_width();
+ const int reference_height =
+ (reference_frame_index == -1)
+ ? frame_header_.height
+ : reference_frames_[reference_frame_index]->frame_height();
+ assert(2 * frame_header_.width >= reference_upscaled_width &&
+ 2 * frame_header_.height >= reference_height &&
+ frame_header_.width <= 16 * reference_upscaled_width &&
+ frame_header_.height <= 16 * reference_height);
+ const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
+ const bool is_scaled_y = reference_height != frame_header_.height;
+ const int half_sample = 1 << (kSubPixelBits - 1);
+ int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
+ int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
+ const int rounding_offset =
+ DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
+ if (is_scaled_x) {
+ const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
+ DivideBy2(frame_header_.width)) /
+ frame_header_.width;
+ *step_x = RightShiftWithRoundingSigned(
+ scale_x, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_x += half_sample;
+ // When frame size is 4k and above, orig_x can be above 16 bits, scale_x can
+ // be up to 15 bits. So we use int64_t to hold base_x.
+ const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
+ (half_sample << kReferenceScaleShift);
+ *start_x =
+ RightShiftWithRoundingSigned(
+ base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_x = 1 << kScaleSubPixelBits;
+ *start_x = LeftShift(orig_x, 6) + rounding_offset;
+ }
+ if (is_scaled_y) {
+ const int scale_y = ((reference_height << kReferenceScaleShift) +
+ DivideBy2(frame_header_.height)) /
+ frame_header_.height;
+ *step_y = RightShiftWithRoundingSigned(
+ scale_y, kReferenceScaleShift - kScaleSubPixelBits);
+ orig_y += half_sample;
+ const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
+ (half_sample << kReferenceScaleShift);
+ *start_y =
+ RightShiftWithRoundingSigned(
+ base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
+ rounding_offset;
+ } else {
+ *step_y = 1 << kScaleSubPixelBits;
+ *start_y = LeftShift(orig_y, 6) + rounding_offset;
+ }
+}
+
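
Worked numbers for the scaled-x branch, assuming libgav1's kSubPixelBits == 4 and kScaleSubPixelBits == 10 (consistent with the LeftShift(orig_x, 6) in the unscaled path): a half-width reference yields a step of 512 in 1/1024-pel units, so each output pixel advances half a reference pixel.

constexpr int kSubPixelBits = 4;        // assumed libgav1 value
constexpr int kScaleSubPixelBits = 10;  // assumed libgav1 value
constexpr int kReferenceScaleShift = 14;

constexpr int scale_x =
    ((960 << kReferenceScaleShift) + 1920 / 2) / 1920;  // 8192 == 0.5 << 14
constexpr int step_x =
    (scale_x + (1 << (kReferenceScaleShift - kScaleSubPixelBits - 1))) >>
    (kReferenceScaleShift - kScaleSubPixelBits);  // RightShiftWithRounding
static_assert(scale_x == 8192, "half-width reference => scale of 0.5");
static_assert(step_x == 512, "0.5 reference pixels per output pixel");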
// static.
bool Tile::GetReferenceBlockPosition(
const int reference_frame_index, const bool is_scaled, const int width,
@@ -1007,12 +1064,9 @@
kScaleSubPixelBits) +
kSubPixelTaps;
}
- const int copy_start_x =
- std::min(std::max(ref_block_start_x, ref_start_x), ref_last_x);
- const int copy_end_x =
- std::max(std::min(ref_block_end_x, ref_last_x), copy_start_x);
- const int copy_start_y =
- std::min(std::max(ref_block_start_y, ref_start_y), ref_last_y);
+ const int copy_start_x = Clip3(ref_block_start_x, ref_start_x, ref_last_x);
+ const int copy_start_y = Clip3(ref_block_start_y, ref_start_y, ref_last_y);
+ const int copy_end_x = Clip3(ref_block_end_x, copy_start_x, ref_last_x);
const int block_width = copy_end_x - copy_start_x + 1;
const bool extend_left = ref_block_start_x < ref_start_x;
const bool extend_right = ref_block_end_x > ref_last_x;
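
The Clip3 rewrite folds each min/max chain into one clamp. For copy_end_x it also swaps the operation order, which is safe because copy_start_x has already been clamped to ref_last_x, so the bounds are ordered. A sketch assuming the usual clamp definition for Clip3:

#include <algorithm>
#include <cassert>

int Clip3(int value, int low, int high) {
  return std::min(std::max(value, low), high);
}

int main() {
  // With low <= high the two clamp orders agree, so the rewritten
  // copy_end_x computation is behavior-preserving.
  for (int v = -5; v <= 5; ++v) {
    assert(Clip3(v, -2, 3) == std::max(std::min(v, 3), -2));
  }
  return 0;
}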
@@ -1184,12 +1238,6 @@
kConvolveBorderLeftTop * pixel_size);
}
- const int has_horizontal_filter = static_cast<int>(
- ((mv.mv[MotionVector::kColumn] * (1 << (1 - subsampling_x))) &
- kSubPixelMask) != 0);
- const int has_vertical_filter = static_cast<int>(
- ((mv.mv[MotionVector::kRow] * (1 << (1 - subsampling_y))) &
- kSubPixelMask) != 0);
void* const output =
(is_compound || is_inter_intra) ? prediction : static_cast<void*>(dest);
ptrdiff_t output_stride = (is_compound || is_inter_intra)
@@ -1214,14 +1262,17 @@
vertical_filter_index, start_x, start_y, step_x, step_y,
width, height, output, output_stride);
} else {
+ const int horizontal_filter_id = (start_x >> 6) & kSubPixelMask;
+ const int vertical_filter_id = (start_y >> 6) & kSubPixelMask;
+
dsp::ConvolveFunc convolve_func =
dsp_.convolve[reference_frame_index == -1][is_compound]
- [has_vertical_filter][has_horizontal_filter];
+ [vertical_filter_id != 0][horizontal_filter_id != 0];
assert(convolve_func != nullptr);
convolve_func(block_start, convolve_buffer_stride, horizontal_filter_index,
- vertical_filter_index, start_x, start_y, width, height,
- output, output_stride);
+ vertical_filter_index, horizontal_filter_id,
+ vertical_filter_id, width, height, output, output_stride);
}
return true;
}
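
In the non-scaled path the sub-pixel filter ids are now derived from start_x and start_y rather than recomputed from the motion vector, and the convolve call receives the ids themselves. With the constants assumed above, the 1/16-pel phase sits in bits 6..9 of the scaled position:

constexpr int kSubPixelBits = 4;        // assumed libgav1 value
constexpr int kScaleSubPixelBits = 10;  // assumed libgav1 value
constexpr int kSubPixelMask = (1 << kSubPixelBits) - 1;

constexpr int FilterId(int start) {
  return (start >> (kScaleSubPixelBits - kSubPixelBits)) & kSubPixelMask;
}

// An unscaled position of 5 + 3/16 pixels: fraction 3 selects filter id 3.
static_assert(FilterId(((5 << kSubPixelBits) + 3) << 6) == 3, "");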
diff --git a/libgav1/src/tile/tile.cc b/libgav1/src/tile/tile.cc
index f79158f..9699517 100644
--- a/libgav1/src/tile/tile.cc
+++ b/libgav1/src/tile/tile.cc
@@ -40,11 +40,8 @@
namespace {
// Import all the constants in the anonymous namespace.
-#include "src/quantizer_tables.inc"
#include "src/scan_tables.inc"
-// Precision bits when scaling reference frames.
-constexpr int kReferenceScaleShift = 14;
// Range above kNumQuantizerBaseLevels which the exponential golomb coding
// process is activated.
constexpr int kQuantizerCoefficientBaseRange = 12;
@@ -422,6 +419,7 @@
RefCountedBuffer* const current_frame, const DecoderState& state,
FrameScratchBuffer* const frame_scratch_buffer,
const WedgeMaskArray& wedge_masks,
+ const QuantizerMatrix& quantizer_matrix,
SymbolDecoderContext* const saved_symbol_decoder_context,
const SegmentationMap* prev_segment_ids,
PostFilter* const post_filter, const dsp::Dsp* const dsp,
@@ -446,6 +444,7 @@
motion_field_(frame_scratch_buffer->motion_field),
reference_order_hint_(state.reference_order_hint),
wedge_masks_(wedge_masks),
+ quantizer_matrix_(quantizer_matrix),
reader_(data_, size_, frame_header_.enable_cdf_update),
symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
saved_symbol_decoder_context_(saved_symbol_decoder_context),
@@ -503,7 +502,7 @@
memset(delta_lf_, 0, sizeof(delta_lf_));
delta_lf_all_zero_ = true;
const YuvBuffer& buffer = post_filter_.frame_buffer();
- for (int plane = 0; plane < PlaneCount(); ++plane) {
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
// Verify that the borders are big enough for Reconstruct(). max_tx_length
// is the maximum value of tx_width and tx_height for the plane.
const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
@@ -543,12 +542,12 @@
buffer.stride(plane),
post_filter_.GetUnfilteredBuffer(plane));
const int plane_height =
- RightShiftWithRounding(frame_header_.height, subsampling_y_[plane]);
+ SubsampledValue(frame_header_.height, subsampling_y_[plane]);
deblock_row_limit_[plane] =
std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
<< subsampling_y_[plane]);
const int plane_width =
- RightShiftWithRounding(frame_header_.width, subsampling_x_[plane]);
+ SubsampledValue(frame_header_.width, subsampling_x_[plane]);
deblock_column_limit_[plane] =
std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
<< subsampling_x_[plane]);
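
A quick check that the SubsampledValue() swap is behavior-preserving for the only subsampling factors in play, 0 and 1; both helper definitions below are sketches of the utils versions:

#include <cassert>

int RightShiftWithRounding(int value, int bits) {
  return (value + ((1 << bits) >> 1)) >> bits;
}

int SubsampledValue(int value, int subsampling) {
  return (value + subsampling) >> subsampling;
}

int main() {
  for (int v = 0; v < 1000; ++v) {
    assert(SubsampledValue(v, 0) == RightShiftWithRounding(v, 0));
    assert(SubsampledValue(v, 1) == RightShiftWithRounding(v, 1));
  }
  return 0;
}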
@@ -610,7 +609,7 @@
const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
column4x4 += block_width4x4) {
- if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4, scratch_buffer,
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
processing_mode)) {
LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
row4x4, column4x4);
@@ -643,9 +642,6 @@
}
bool Tile::ParseAndDecode() {
- // If this is the main thread, we build the loop filter bit masks when parsing
- // so that it happens in the current thread. This ensures that the main thread
- // does as much work as possible.
if (split_parse_and_decode_) {
if (!ThreadedParseAndDecode()) return false;
SaveSymbolDecoderContext();
@@ -777,8 +773,8 @@
for (int column4x4 = column4x4_start_, column_index = 0;
column4x4 < column4x4_end_;
column4x4 += block_width4x4, ++column_index) {
- if (!ProcessSuperBlock(row4x4, column4x4, block_width4x4,
- scratch_buffer.get(), kProcessingModeParseOnly)) {
+ if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeParseOnly)) {
std::lock_guard<std::mutex> lock(threading_.mutex);
threading_.abort = true;
break;
@@ -863,8 +859,8 @@
tile_scratch_buffer_pool_->Get();
bool ok = scratch_buffer != nullptr;
if (ok) {
- ok = ProcessSuperBlock(row4x4, column4x4, block_width4x4,
- scratch_buffer.get(), kProcessingModeDecodeOnly);
+ ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
+ kProcessingModeDecodeOnly);
tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
}
std::unique_lock<std::mutex> lock(threading_.mutex);
@@ -921,7 +917,7 @@
const size_t pixel_size =
(sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
: sizeof(uint16_t));
- for (int plane = 0; plane < PlaneCount(); ++plane) {
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
const int row_to_copy =
(MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
const size_t pixels_to_copy =
@@ -1060,6 +1056,18 @@
if (bp.is_inter) {
cdf = symbol_decoder_context_
.inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
+ switch (tx_set) {
+ case kTransformSetInter1:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
+ break;
+ case kTransformSetInter2:
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
+ break;
+ default:
+ assert(tx_set == kTransformSetInter3);
+ tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
+ break;
+ }
} else {
const PredictionMode intra_direction =
block.bp->prediction_parameters->use_filter_intra
@@ -1069,9 +1077,12 @@
cdf =
symbol_decoder_context_
.intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
+ assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
+ tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
+ ? reader_.ReadSymbol<7>(cdf)
+ : reader_.ReadSymbol<5>(cdf));
}
- tx_type = static_cast<TransformType>(
- reader_.ReadSymbol(cdf, kNumTransformTypesInSet[tx_set]));
+
// This array does not contain an entry for kTransformSetDctOnly, so the
// first dimension needs to be offset by 1.
tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
@@ -1089,49 +1100,57 @@
// positions are still all 0s according to the diagonal scan order.
template <typename ResidualType>
void Tile::ReadCoeffBase2D(
- const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* const quantized_buffer) {
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
const int tx_width = 1 << adjusted_tx_width_log2;
- int i = eob - 2;
- do {
- constexpr auto threshold = static_cast<ResidualType>(3);
+ for (int i = eob - 2; i >= 1; --i) {
const uint16_t pos = scan[i];
const int row = pos >> adjusted_tx_width_log2;
const int column = pos & (tx_width - 1);
auto* const quantized = &quantized_buffer[pos];
- int context;
- if (pos == 0) {
- context = 0;
- } else {
- context = std::min(
- 4, DivideBy2(
- 1 + (std::min(quantized[1], threshold) + // {0, 1}
- std::min(quantized[tx_width], threshold) + // {1, 0}
- std::min(quantized[tx_width + 1], threshold) + // {1, 1}
- std::min(quantized[2], threshold) + // {0, 2}
- std::min(quantized[MultiplyBy2(tx_width)],
- threshold)))); // {2, 0}
- context += kCoeffBaseContextOffset[tx_size][std::min(row, 4)]
- [std::min(column, 4)];
- }
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
+ levels[tx_width + 1] + levels[2] +
+ levels[MultiplyBy2(tx_width)];
+ const int context =
+ ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
int level =
reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
if (level > kNumQuantizerBaseLevels) {
// No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
// + 1, because we clip the overall output to 6 and the unclipped
// quantized values will always result in an output of greater than 6.
- context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
- quantized[tx_width] + // {1, 0}
- quantized[tx_width + 1])); // {1, 1}
- if (pos != 0) {
- context += 14 >> static_cast<int>((row | column) < 2);
- }
- level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ context += 14 >> static_cast<int>((row | column) < 2);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
}
quantized[0] = level;
- } while (--i >= 0);
+ }
+ // Read position 0.
+ {
+ auto* const quantized = &quantized_buffer[0];
+ int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
+ level_buffer[0] = level;
+ if (level > kNumQuantizerBaseLevels) {
+ // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
+ // + 1, because we clip the overall output to 6 and the unclipped
+ // quantized values will always result in an output of greater than 6.
+ const int context =
+ std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[tx_width + 1])); // {1, 1}
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
+ }
+ quantized[0] = level;
+ }
}
// Section 8.3.2 in the spec, under coeff_base and coeff_br.
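
Why the per-neighbor std::min(x, 3) clamps could be dropped in the rewritten context derivations: level_buffer stores only the coeff_base symbol, which ReadSymbol<kCoeffBaseSymbolCount>() keeps in [0, 3], while quantized_buffer keeps the full level after the base-range addition. The five-neighbor sum plus one therefore never exceeds 16, making (sum > 7) ? 4 : sum / 2 equal to the old std::min(4, DivideBy2(sum)). A sketch check:

#include <algorithm>
#include <cassert>

int main() {
  // 1 + five neighbors, each in [0, 3]: the sum is in [1, 16].
  for (int sum = 1; sum <= 16; ++sum) {
    const int new_context = (sum > 7) ? 4 : sum / 2;
    assert(new_context == std::min(4, sum / 2));
  }
  return 0;
}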
@@ -1148,41 +1167,41 @@
// we always do the boundary check for its fourth right neighbor.
template <typename ResidualType>
void Tile::ReadCoeffBaseHorizontal(
- const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* const quantized_buffer) {
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
const int tx_width = 1 << adjusted_tx_width_log2;
int i = eob - 2;
do {
- constexpr auto threshold = static_cast<ResidualType>(3);
const uint16_t pos = scan[i];
const int column = pos & (tx_width - 1);
auto* const quantized = &quantized_buffer[pos];
- int context = std::min(
- 4,
- DivideBy2(1 +
- (std::min(quantized[1], threshold) + // {0, 1}
- std::min(quantized[tx_width], threshold) + // {1, 0}
- std::min(quantized[2], threshold) + // {0, 2}
- std::min(quantized[3], threshold) + // {0, 3}
- std::min(quantized[4],
- static_cast<ResidualType>(
- (column + 4 < tx_width) ? 3 : 0))))); // {0, 4}
- context += kCoeffBasePositionContextOffset[column];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (levels[1] + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[2] + // {0, 2}
+ levels[3] + // {0, 3}
+ ((column + 4 < tx_width) ? levels[4] : 0)); // {0, 4}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[column];
int level =
reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
if (level > kNumQuantizerBaseLevels) {
// No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
// + 1, because we clip the overall output to 6 and the unclipped
// quantized values will always result in an output of greater than 6.
- context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
- quantized[tx_width] + // {1, 0}
- quantized[2])); // {0, 2}
+ int context = std::min(6, DivideBy2(1 + quantized[1] + // {0, 1}
+ quantized[tx_width] + // {1, 0}
+ quantized[2])); // {0, 2}
if (pos != 0) {
context += 14 >> static_cast<int>(column == 0);
}
- level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
}
quantized[0] = level;
} while (--i >= 0);
@@ -1193,36 +1212,36 @@
// Right boundary check is performed explicitly.
template <typename ResidualType>
void Tile::ReadCoeffBaseVertical(
- const uint16_t* scan, PlaneType plane_type, TransformSize /*tx_size*/,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* const quantized_buffer) {
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
const int tx_width = 1 << adjusted_tx_width_log2;
int i = eob - 2;
do {
- constexpr auto threshold = static_cast<ResidualType>(3);
const uint16_t pos = scan[i];
const int row = pos >> adjusted_tx_width_log2;
const int column = pos & (tx_width - 1);
auto* const quantized = &quantized_buffer[pos];
- const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
- int context =
- std::min(4, DivideBy2(1 + (std::min(quantized_column1, 3) + // {0, 1}
- std::min(quantized[tx_width],
- threshold) + // {1, 0}
- std::min(quantized[MultiplyBy2(tx_width)],
- threshold) + // {2, 0}
- std::min(quantized[tx_width * 3],
- threshold) + // {3, 0}
- std::min(quantized[MultiplyBy4(tx_width)],
- threshold)))); // {4, 0}
- context += kCoeffBasePositionContextOffset[row];
+ auto* const levels = &level_buffer[pos];
+ const int neighbor_sum =
+ 1 + (((column + 1 < tx_width) ? levels[1] : 0) + // {0, 1}
+ levels[tx_width] + // {1, 0}
+ levels[MultiplyBy2(tx_width)] + // {2, 0}
+ levels[tx_width * 3] + // {3, 0}
+ levels[MultiplyBy4(tx_width)]); // {4, 0}
+ const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
+ kCoeffBasePositionContextOffset[row];
int level =
reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
+ levels[0] = level;
if (level > kNumQuantizerBaseLevels) {
// No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
// + 1, because we clip the overall output to 6 and the unclipped
// quantized values will always result in an output of greater than 6.
+ const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
int context =
std::min(6, DivideBy2(1 + quantized_column1 + // {0, 1}
quantized[tx_width] + // {1, 0}
@@ -1230,7 +1249,7 @@
if (pos != 0) {
context += 14 >> static_cast<int>(row == 0);
}
- level += ReadCoeffBaseRange(clamped_tx_size_context, context, plane_type);
+ level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
}
quantized[0] = level;
} while (--i >= 0);
@@ -1272,68 +1291,6 @@
num_left_elements);
}
-void Tile::ScaleMotionVector(const MotionVector& mv, const Plane plane,
- const int reference_frame_index, const int x,
- const int y, int* const start_x,
- int* const start_y, int* const step_x,
- int* const step_y) {
- const int reference_upscaled_width =
- (reference_frame_index == -1)
- ? frame_header_.upscaled_width
- : reference_frames_[reference_frame_index]->upscaled_width();
- const int reference_height =
- (reference_frame_index == -1)
- ? frame_header_.height
- : reference_frames_[reference_frame_index]->frame_height();
- assert(2 * frame_header_.width >= reference_upscaled_width &&
- 2 * frame_header_.height >= reference_height &&
- frame_header_.width <= 16 * reference_upscaled_width &&
- frame_header_.height <= 16 * reference_height);
- const bool is_scaled_x = reference_upscaled_width != frame_header_.width;
- const bool is_scaled_y = reference_height != frame_header_.height;
- const int half_sample = 1 << (kSubPixelBits - 1);
- int orig_x = (x << kSubPixelBits) + ((2 * mv.mv[1]) >> subsampling_x_[plane]);
- int orig_y = (y << kSubPixelBits) + ((2 * mv.mv[0]) >> subsampling_y_[plane]);
- const int rounding_offset =
- DivideBy2(1 << (kScaleSubPixelBits - kSubPixelBits));
- if (is_scaled_x) {
- const int scale_x = ((reference_upscaled_width << kReferenceScaleShift) +
- DivideBy2(frame_header_.width)) /
- frame_header_.width;
- *step_x = RightShiftWithRoundingSigned(
- scale_x, kReferenceScaleShift - kScaleSubPixelBits);
- orig_x += half_sample;
- // When frame size is 4k and above, orig_x can be above 16 bits, scale_x can
- // be up to 15 bits. So we use int64_t to hold base_x.
- const int64_t base_x = static_cast<int64_t>(orig_x) * scale_x -
- (half_sample << kReferenceScaleShift);
- *start_x =
- RightShiftWithRoundingSigned(
- base_x, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
- rounding_offset;
- } else {
- *step_x = 1 << kScaleSubPixelBits;
- *start_x = LeftShift(orig_x, 6) + rounding_offset;
- }
- if (is_scaled_y) {
- const int scale_y = ((reference_height << kReferenceScaleShift) +
- DivideBy2(frame_header_.height)) /
- frame_header_.height;
- *step_y = RightShiftWithRoundingSigned(
- scale_y, kReferenceScaleShift - kScaleSubPixelBits);
- orig_y += half_sample;
- const int64_t base_y = static_cast<int64_t>(orig_y) * scale_y -
- (half_sample << kReferenceScaleShift);
- *start_y =
- RightShiftWithRoundingSigned(
- base_y, kReferenceScaleShift + kSubPixelBits - kScaleSubPixelBits) +
- rounding_offset;
- } else {
- *step_y = 1 << kScaleSubPixelBits;
- *start_y = LeftShift(orig_y, 6) + rounding_offset;
- }
-}
-
template <typename ResidualType, bool is_dc_coefficient>
bool Tile::ReadSignAndApplyDequantization(
const uint16_t* const scan, int i, int q_value,
@@ -1395,13 +1352,11 @@
return true;
}
-int Tile::ReadCoeffBaseRange(int clamped_tx_size_context, int cdf_context,
- int plane_type) {
+int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
int level = 0;
for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
- const int coeff_base_range = reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(
- symbol_decoder_context_.coeff_base_range_cdf[clamped_tx_size_context]
- [plane_type][cdf_context]);
+ const int coeff_base_range =
+ reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
level += coeff_base_range;
if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
}
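
Passing the resolved cdf pointer hoists the three-dimensional coeff_base_range_cdf indexing out of this loop and its call sites. The loop's shape, with the reader left as a stand-in:

#include <cstdint>

constexpr int kCoeffBaseRangeSymbolCount = 4;
constexpr int kCoeffBaseRangeMaxIterations = 4;  // kQuantizerCoefficientBaseRange / 3

template <typename Reader>
int ReadCoeffBaseRangeSketch(Reader& reader, uint16_t* cdf) {
  int level = 0;
  for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
    // Each pass adds 0..3; only a maximal symbol (3) continues the loop, so
    // the decoded increment is capped at 4 * 3 == 12.
    const int coeff_base_range =
        reader.template ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
    level += coeff_base_range;
    if (coeff_base_range < kCoeffBaseRangeSymbolCount - 1) break;
  }
  return level;
}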
@@ -1442,6 +1397,11 @@
// Clear padding to avoid bottom boundary checks when parsing quantized
// coefficients.
memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
+ uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
+ memset(
+ level_buffer, 0,
+ kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
+ tx_padding);
const int clamped_tx_height = std::min(tx_height, 32);
if (plane == kPlaneY) {
ReadTransformType(block, x4, y4, tx_size);
@@ -1452,33 +1412,38 @@
const PlaneType plane_type = GetPlaneType(plane);
const TransformClass tx_class = GetTransformClass(*tx_type);
context = static_cast<int>(tx_class != kTransformClass2D);
- uint16_t* cdf;
+ int eob_pt = 1;
switch (eob_multi_size) {
case 0:
- cdf = symbol_decoder_context_.eob_pt_16_cdf[plane_type][context];
+ eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
+ symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
break;
case 1:
- cdf = symbol_decoder_context_.eob_pt_32_cdf[plane_type][context];
+ eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
+ symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
break;
case 2:
- cdf = symbol_decoder_context_.eob_pt_64_cdf[plane_type][context];
+ eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
+ symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
break;
case 3:
- cdf = symbol_decoder_context_.eob_pt_128_cdf[plane_type][context];
+ eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
+ symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
break;
case 4:
- cdf = symbol_decoder_context_.eob_pt_256_cdf[plane_type][context];
+ eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
+ symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
break;
case 5:
- cdf = symbol_decoder_context_.eob_pt_512_cdf[plane_type];
+ eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
+ symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
break;
case 6:
default:
- cdf = symbol_decoder_context_.eob_pt_1024_cdf[plane_type];
+ eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
+ symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
break;
}
- const int eob_pt =
- 1 + reader_.ReadSymbol(cdf, kEobPt16SymbolCount + eob_multi_size);
int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
if (eob_pt >= 3) {
context = eob_pt - 3;
@@ -1496,20 +1461,22 @@
}
const uint16_t* scan = kScan[tx_class][tx_size];
const int clamped_tx_size_context = std::min(tx_size_context, 3);
+ auto coeff_base_range_cdf =
+ symbol_decoder_context_
+ .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
// Read the last coefficient.
{
context = GetCoeffBaseContextEob(tx_size, eob - 1);
const uint16_t pos = scan[eob - 1];
int level =
- 1 + reader_.ReadSymbol(
+ 1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
symbol_decoder_context_
- .coeff_base_eob_cdf[tx_size_context][plane_type][context],
- kCoeffBaseEobSymbolCount);
+ .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
+ level_buffer[pos] = level;
if (level > kNumQuantizerBaseLevels) {
- level += ReadCoeffBaseRange(
- clamped_tx_size_context,
- GetCoeffBaseRangeContextEob(adjusted_tx_width_log2, pos, tx_class),
- plane_type);
+ level +=
+ ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
+ adjusted_tx_width_log2, pos, tx_class)]);
}
residual[pos] = level;
}
@@ -1518,18 +1485,19 @@
// Lookup used to call the right variant of ReadCoeffBase*() based on the
// transform class.
static constexpr void (Tile::*kGetCoeffBaseFunc[])(
- const uint16_t* scan, PlaneType plane_type, TransformSize tx_size,
- int clamped_tx_size_context, int adjusted_tx_width_log2, int eob,
+ const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
+ int eob,
uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
- ResidualType* quantized_buffer) = {
- &Tile::ReadCoeffBase2D<ResidualType>,
- &Tile::ReadCoeffBaseHorizontal<ResidualType>,
- &Tile::ReadCoeffBaseVertical<ResidualType>};
+ uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
+ [kCoeffBaseRangeSymbolCount + 1],
+ ResidualType* quantized_buffer,
+ uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
+ &Tile::ReadCoeffBaseHorizontal<ResidualType>,
+ &Tile::ReadCoeffBaseVertical<ResidualType>};
(this->*kGetCoeffBaseFunc[tx_class])(
- scan, plane_type, tx_size, clamped_tx_size_context,
- adjusted_tx_width_log2, eob,
+ scan, tx_size, adjusted_tx_width_log2, eob,
symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
- residual);
+ coeff_base_range_cdf, residual, level_buffer);
}
const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
const int current_quantizer_index = GetQIndex(
@@ -1542,8 +1510,9 @@
*tx_type < kTransformTypeIdentityIdentity &&
!frame_header_.segmentation.lossless[bp.segment_id] &&
frame_header_.quantizer.matrix_level[plane] < 15)
- ? &kQuantizerMatrix[frame_header_.quantizer.matrix_level[plane]]
- [plane_type][kQuantizerMatrixOffset[tx_size]]
+ ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
+ [plane_type][adjusted_tx_size]
+ .get()
: nullptr;
int coefficient_level = 0;
int8_t dc_category = 0;
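
The quantizer matrices move from a flattened constant table indexed through kQuantizerMatrixOffset to per-transform-size buffers generated at runtime, which is why the lookup ends in .get(). A sketch of the implied container; the exact types are assumptions read off the usage:

#include <array>
#include <cstdint>
#include <memory>

constexpr int kNumQuantizerLevelsForMatrix = 15;  // matrix_level < 15 above
constexpr int kNumPlaneTypes = 2;
constexpr int kNumTransformSizes = 19;  // AV1's transform size count

using QuantizerMatrixSketch = std::array<
    std::array<std::array<std::unique_ptr<uint8_t[]>, kNumTransformSizes>,
               kNumPlaneTypes>,
    kNumQuantizerLevelsForMatrix>;

const uint8_t* Lookup(const QuantizerMatrixSketch& m, int level,
                      int plane_type, int tx_size) {
  // One contiguous buffer per transform size; nullptr when never generated.
  return m[level][plane_type][tx_size].get();
}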
@@ -1657,11 +1626,12 @@
const int sb_row_index = SuperBlockRowIndex(block.row4x4);
const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
if (mode == kProcessingModeDecodeOnly) {
- TransformParameterQueue& tx_params =
+ Queue<TransformParameters>& tx_params =
*residual_buffer_threaded_[sb_row_index][sb_column_index]
->transform_parameters();
ReconstructBlock(block, plane, start_x, start_y, tx_size,
- tx_params.Type(), tx_params.NonZeroCoeffCount());
+ tx_params.Front().type,
+ tx_params.Front().non_zero_coeff_count);
tx_params.Pop();
} else {
TransformType tx_type;
@@ -1684,7 +1654,7 @@
assert(mode == kProcessingModeParseOnly);
residual_buffer_threaded_[sb_row_index][sb_column_index]
->transform_parameters()
- ->Push(non_zero_coeff_count, tx_type);
+ ->Push(TransformParameters(tx_type, non_zero_coeff_count));
}
}
}
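
With the custom TransformParameterQueue replaced by a generic Queue<TransformParameters>, the parse pass pushes one record per transform block and the decode pass consumes them in order. A minimal sketch of the handoff, with std::queue standing in and an assumed struct layout:

#include <cstdint>
#include <queue>

enum TransformType : uint8_t { kTransformTypeDctDct };

struct TransformParameters {
  TransformParameters(TransformType type, int non_zero_coeff_count)
      : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
  TransformType type;
  int non_zero_coeff_count;
};

int main() {
  std::queue<TransformParameters> tx_params;  // stand-in for Queue<>
  tx_params.push({kTransformTypeDctDct, 5});  // parse pass
  const TransformParameters& front = tx_params.front();  // decode pass
  const int count = front.non_zero_coeff_count;
  tx_params.pop();
  return count == 5 ? 0 : 1;
}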
@@ -1793,8 +1763,9 @@
const BlockParameters& bp = *block.bp;
for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
- for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1);
- ++plane) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
const int subsampling_x = subsampling_x_[plane];
const int subsampling_y = subsampling_y_[plane];
// For Y Plane, when lossless is true |bp.transform_size| is always
@@ -1833,7 +1804,7 @@
}
}
}
- }
+ } while (++plane < num_planes);
}
}
return true;
@@ -1913,6 +1884,7 @@
GetClampParameters(block, min, max);
BlockParameters& bp = *block.bp;
const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
+ bp.mv.mv64 = 0;
if (is_compound) {
for (int i = 0; i < 2; ++i) {
const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
@@ -1975,6 +1947,7 @@
BlockParameters& bp = *block.bp;
const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
+ bp.mv.mv64 = 0;
ReadMotionVector(block, 0);
if (ref_mv_0.mv32 == 0) {
const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
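
The new bp.mv.mv64 = 0 stores clear both candidate motion vectors in a single write before any components are read. A sketch of the union layout this implies; the exact member arrangement is an assumption drawn from the mv32/mv64 accessors used here:

#include <cstdint>

union MotionVectorSketch {
  struct {
    int16_t row;
    int16_t column;
  } mv;
  uint32_t mv32;
};

union CompoundMotionVectorSketch {
  MotionVectorSketch mv[2];
  uint64_t mv64;
};

void Reset(CompoundMotionVectorSketch* cmv) {
  cmv->mv64 = 0;  // both predictions' vectors become (0, 0) in one store
}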
@@ -1998,7 +1971,9 @@
}
void Tile::ResetEntropyContext(const Block& block) {
- for (int plane = 0; plane < (block.HasChroma() ? PlaneCount() : 1); ++plane) {
+ const int num_planes = block.HasChroma() ? PlaneCount() : 1;
+ int plane = kPlaneY;
+ do {
const int subsampling_x = subsampling_x_[plane];
const int start_x = block.column4x4 >> subsampling_x;
const int end_x =
@@ -2017,7 +1992,7 @@
end_y - start_y);
memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
end_y - start_y);
- }
+ } while (++plane < num_planes);
}
bool Tile::ComputePrediction(const Block& block) {
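
The plane loops rewritten above (in the residual chunk walk and in ResetEntropyContext) and the kPlaneY start below all exploit the invariant that there is always at least one plane: a do/while form drops the initial emptiness test a for loop must emit when the trip count is not provably positive. The shape, as a sketch:

#include <cassert>

void VisitPlanes(int num_planes) {
  assert(num_planes >= 1);  // 1 (Y only) or 3 (Y, U, V)
  int plane = 0;  // kPlaneY
  do {
    // ... per-plane work ...
  } while (++plane < num_planes);
}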
@@ -2036,7 +2011,7 @@
bool is_local_valid = false;
// Local warping parameters, similar usage as is_local_valid.
GlobalMotion local_warp_params;
- int plane = 0;
+ int plane = kPlaneY;
do {
const int8_t subsampling_x = subsampling_x_[plane];
const int8_t subsampling_y = subsampling_y_[plane];
@@ -2147,7 +2122,6 @@
}
bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
- ParameterTree* const tree,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
// Do not process the block if the starting point is beyond the visible frame.
@@ -2158,8 +2132,24 @@
column4x4 >= frame_header_.columns4x4) {
return true;
}
- BlockParameters& bp = *tree->parameters();
- block_parameters_holder_.FillCache(row4x4, column4x4, block_size, &bp);
+
+ if (split_parse_and_decode_) {
+ // Push block ordering info to the queue. DecodeBlock() will use this queue
+ // to decode the blocks in the correct order.
+ const int sb_row_index = SuperBlockRowIndex(row4x4);
+ const int sb_column_index = SuperBlockColumnIndex(column4x4);
+ residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order()
+ ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
+ }
+
+ BlockParameters* bp_ptr =
+ block_parameters_holder_.Get(row4x4, column4x4, block_size);
+ if (bp_ptr == nullptr) {
+ LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
+ return false;
+ }
+ BlockParameters& bp = *bp_ptr;
Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
bp.size = block_size;
bp.prediction_parameters =
@@ -2211,16 +2201,13 @@
return true;
}
-bool Tile::DecodeBlock(ParameterTree* const tree,
+bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
- const int row4x4 = tree->row4x4();
- const int column4x4 = tree->column4x4();
if (row4x4 >= frame_header_.rows4x4 ||
column4x4 >= frame_header_.columns4x4) {
return true;
}
- const BlockSize block_size = tree->block_size();
Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
if (!ComputePrediction(block) ||
!Residual(block, kProcessingModeDecodeOnly)) {
@@ -2231,27 +2218,22 @@
}
bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
- ParameterTree* const root,
TileScratchBuffer* const scratch_buffer,
ResidualPtr* residual) {
- Stack<ParameterTree*, kDfsStackSize> stack;
+ Stack<PartitionTreeNode, kDfsStackSize> stack;
// Set up the first iteration.
- ParameterTree* node = root;
- int row4x4 = row4x4_start;
- int column4x4 = column4x4_start;
- BlockSize block_size = SuperBlockSize();
+ stack.Push(
+ PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));
// DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
// Otherwise, the children are pushed into the stack for future processing.
do {
- if (!stack.Empty()) {
- // Set up subsequent iterations.
- node = stack.Pop();
- row4x4 = node->row4x4();
- column4x4 = node->column4x4();
- block_size = node->block_size();
- }
+ PartitionTreeNode node = stack.Pop();
+ int row4x4 = node.row4x4;
+ int column4x4 = node.column4x4;
+ BlockSize block_size = node.block_size;
+
if (row4x4 >= frame_header_.rows4x4 ||
column4x4 >= frame_header_.columns4x4) {
continue;
@@ -2287,13 +2269,13 @@
sequence_header_.color_config.subsampling_y);
return false;
}
- if (!node->SetPartitionType(partition)) {
- LIBGAV1_DLOG(ERROR, "node->SetPartitionType() failed.");
- return false;
- }
+
+ const int quarter_block4x4 = half_block4x4 >> 1;
+ const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
+ assert(partition == kPartitionNone || sub_size != kBlockInvalid);
switch (partition) {
case kPartitionNone:
- if (!ProcessBlock(row4x4, column4x4, sub_size, node, scratch_buffer,
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
residual)) {
return false;
}
@@ -2301,28 +2283,82 @@
case kPartitionSplit:
// The children must be added in reverse order since a stack is being
// used.
- for (int i = 3; i >= 0; --i) {
- ParameterTree* const child = node->children(i);
- assert(child != nullptr);
- stack.Push(child);
- }
+ stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
+ column4x4 + half_block4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
+ stack.Push(
+ PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
+ stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
break;
case kPartitionHorizontal:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVertical:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontalWithTopSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontalWithBottomSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVerticalWithLeftSplit:
+ if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionVerticalWithRightSplit:
+ if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
+ residual) ||
+ !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
+ scratch_buffer, residual) ||
+ !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
+ split_size, scratch_buffer, residual)) {
+ return false;
+ }
+ break;
case kPartitionHorizontal4:
+ for (int i = 0; i < 4; ++i) {
+ if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
+ scratch_buffer, residual)) {
+ return false;
+ }
+ }
+ break;
case kPartitionVertical4:
for (int i = 0; i < 4; ++i) {
- ParameterTree* const child = node->children(i);
- // Once a null child is seen, all the subsequent children will also be
- // null.
- if (child == nullptr) break;
- if (!ProcessBlock(child->row4x4(), child->column4x4(),
- child->block_size(), child, scratch_buffer,
- residual)) {
+ if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
+ scratch_buffer, residual)) {
return false;
}
}
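
ProcessPartition now drives the depth-first walk with plain PartitionTreeNode values instead of a pre-built ParameterTree, and the kPartitionSplit case pushes the four quadrants in reverse. The reversal matters because the stack is LIFO: the last child pushed is the first popped, which restores raster (top-left first) processing order. A toy illustration of that ordering, using std::stack only for the sketch:

#include <cassert>
#include <stack>

void CheckReversePushOrder() {
  std::stack<int> stack;
  for (int child = 3; child >= 0; --child) stack.push(child);  // reverse push.
  for (int expected = 0; expected < 4; ++expected) {           // pops 0,1,2,3.
    assert(stack.top() == expected);
    stack.pop();
  }
}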
@@ -2367,7 +2403,7 @@
sizeof(scratch_buffer->block_decoded));
// Set specific edge cases to true.
const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
- for (int plane = 0; plane < PlaneCount(); ++plane) {
+ for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
const int subsampling_x = subsampling_x_[plane];
const int subsampling_y = subsampling_y_[plane];
const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
@@ -2395,7 +2431,7 @@
}
}
-bool Tile::ProcessSuperBlock(int row4x4, int column4x4, int block_width4x4,
+bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
TileScratchBuffer* const scratch_buffer,
ProcessingMode mode) {
const bool parsing =
@@ -2413,13 +2449,10 @@
if (parsing) {
ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
}
- const int row = row4x4 / block_width4x4;
- const int column = column4x4 / block_width4x4;
if (parsing && decoding) {
uint8_t* residual_buffer = residual_buffer_.get();
- if (!ProcessPartition(row4x4, column4x4,
- block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
column4x4);
return false;
@@ -2437,18 +2470,14 @@
}
uint8_t* residual_buffer =
residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
- if (!ProcessPartition(row4x4, column4x4,
- block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
+ &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
column4x4);
return false;
}
} else {
- uint8_t* residual_buffer =
- residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
- if (!DecodeSuperBlock(block_parameters_holder_.Tree(row, column),
- scratch_buffer, &residual_buffer)) {
+ if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
row4x4, column4x4);
return false;
@@ -2459,26 +2488,23 @@
return true;
}
-bool Tile::DecodeSuperBlock(ParameterTree* const tree,
- TileScratchBuffer* const scratch_buffer,
- ResidualPtr* residual) {
- Stack<ParameterTree*, kDfsStackSize> stack;
- stack.Push(tree);
- do {
- ParameterTree* const node = stack.Pop();
- if (node->partition() != kPartitionNone) {
- for (int i = 3; i >= 0; --i) {
- if (node->children(i) == nullptr) continue;
- stack.Push(node->children(i));
- }
- continue;
- }
- if (!DecodeBlock(node, scratch_buffer, residual)) {
+bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
+ TileScratchBuffer* const scratch_buffer) {
+ uint8_t* residual_buffer =
+ residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
+ Queue<PartitionTreeNode>& partition_tree_order =
+ *residual_buffer_threaded_[sb_row_index][sb_column_index]
+ ->partition_tree_order();
+ while (!partition_tree_order.Empty()) {
+ PartitionTreeNode block = partition_tree_order.Front();
+ if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
+ scratch_buffer, &residual_buffer)) {
LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
- node->row4x4(), node->column4x4());
+ block.row4x4, block.column4x4);
return false;
}
- } while (!stack.Empty());
+ partition_tree_order.Pop();
+ }
return true;
}
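
With ParameterTree removed, the ordering contract between the two passes is carried by data rather than by tree structure: ProcessBlock() (parse) pushes each leaf block's coordinates into the superblock's partition_tree_order queue, and DecodeSuperBlock() drains that queue, so the decode pass revisits blocks in exactly the order the parse pass emitted them without re-reading any partition symbols. In summary form:

// Parse side (ProcessBlock):
//   partition_tree_order->Push(PartitionTreeNode(row4x4, column4x4, size));
// Decode side (DecodeSuperBlock):
//   while (!order.Empty()) { DecodeBlock(order.Front() ...); order.Pop(); }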
diff --git a/libgav1/src/utils/array_2d.h b/libgav1/src/utils/array_2d.h
index 2df6241..df2da9f 100644
--- a/libgav1/src/utils/array_2d.h
+++ b/libgav1/src/utils/array_2d.h
@@ -120,7 +120,7 @@
const T* operator[](int row) const { return data_view_[row]; }
private:
- std::unique_ptr<T[]> data_ = nullptr;
+ std::unique_ptr<T[]> data_;
size_t allocated_size_ = 0;
size_t size_ = 0;
Array2DView<T> data_view_;
diff --git a/libgav1/src/utils/block_parameters_holder.cc b/libgav1/src/utils/block_parameters_holder.cc
index 79bb2b8..3bb9f1e 100644
--- a/libgav1/src/utils/block_parameters_holder.cc
+++ b/libgav1/src/utils/block_parameters_holder.cc
@@ -19,53 +19,29 @@
#include "src/utils/common.h"
#include "src/utils/constants.h"
#include "src/utils/logging.h"
-#include "src/utils/parameter_tree.h"
#include "src/utils/types.h"
namespace libgav1 {
-namespace {
-
-// Returns the number of super block rows/columns for |value4x4| where value4x4
-// is either rows4x4 or columns4x4.
-int RowsOrColumns4x4ToSuperBlocks(int value4x4, bool use_128x128_superblock) {
- return use_128x128_superblock ? DivideBy128(MultiplyBy4(value4x4) + 127)
- : DivideBy64(MultiplyBy4(value4x4) + 63);
-}
-
-} // namespace
-
-bool BlockParametersHolder::Reset(int rows4x4, int columns4x4,
- bool use_128x128_superblock) {
+bool BlockParametersHolder::Reset(int rows4x4, int columns4x4) {
rows4x4_ = rows4x4;
columns4x4_ = columns4x4;
- use_128x128_superblock_ = use_128x128_superblock;
- if (!block_parameters_cache_.Reset(rows4x4_, columns4x4_)) {
- LIBGAV1_DLOG(ERROR, "block_parameters_cache_.Reset() failed.");
- return false;
+ index_ = 0;
+ return block_parameters_cache_.Reset(rows4x4_, columns4x4_) &&
+ block_parameters_.Resize(rows4x4_ * columns4x4_);
+}
+
+BlockParameters* BlockParametersHolder::Get(int row4x4, int column4x4,
+ BlockSize block_size) {
+ const size_t index = index_.fetch_add(1, std::memory_order_relaxed);
+ if (index >= block_parameters_.size()) return nullptr;
+ auto& bp = block_parameters_.get()[index];
+ if (bp == nullptr) {
+ bp.reset(new (std::nothrow) BlockParameters);
+ if (bp == nullptr) return nullptr;
}
- const int rows =
- RowsOrColumns4x4ToSuperBlocks(rows4x4_, use_128x128_superblock_);
- const int columns =
- RowsOrColumns4x4ToSuperBlocks(columns4x4_, use_128x128_superblock_);
- const BlockSize sb_size =
- use_128x128_superblock_ ? kBlock128x128 : kBlock64x64;
- const int multiplier = kNum4x4BlocksWide[sb_size];
- if (!trees_.Reset(rows, columns)) {
- LIBGAV1_DLOG(ERROR, "trees_.Reset() failed.");
- return false;
- }
- for (int i = 0; i < rows; ++i) {
- for (int j = 0; j < columns; ++j) {
- trees_[i][j] =
- ParameterTree::Create(i * multiplier, j * multiplier, sb_size);
- if (trees_[i][j] == nullptr) {
- LIBGAV1_DLOG(ERROR, "Allocation of trees_[%d][%d] failed.", i, j);
- return false;
- }
- }
- }
- return true;
+ FillCache(row4x4, column4x4, block_size, bp.get());
+ return bp.get();
}
void BlockParametersHolder::FillCache(int row4x4, int column4x4,
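
Reset() now sizes one flat pool of rows4x4 * columns4x4 unique_ptr slots, and Get() claims slots with a relaxed fetch_add: every caller receives a distinct index, so no two threads ever touch the same slot and relaxed ordering suffices. Allocation is lazy per slot and survives Reset(), which only rewinds the index, so BlockParameters objects are reused across frames. A self-contained sketch of the same bump-allocation idea (names hypothetical, not the libgav1 API):

#include <atomic>
#include <cstddef>
#include <memory>
#include <new>
#include <vector>

template <typename T>
class SlotPool {
 public:
  explicit SlotPool(size_t size) : slots_(size) {}

  // Claims the next slot; allocates it on first use. Returns nullptr when the
  // pool is exhausted or allocation fails, mirroring Get() above.
  T* Claim() {
    const size_t index = next_.fetch_add(1, std::memory_order_relaxed);
    if (index >= slots_.size()) return nullptr;
    auto& slot = slots_[index];
    if (slot == nullptr) slot.reset(new (std::nothrow) T);
    return slot.get();  // nullptr if the allocation failed.
  }

  // Rewinds the pool without freeing: objects are reused on the next pass.
  void Rewind() { next_.store(0, std::memory_order_relaxed); }

 private:
  std::vector<std::unique_ptr<T>> slots_;
  std::atomic<size_t> next_{0};
};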
diff --git a/libgav1/src/utils/block_parameters_holder.h b/libgav1/src/utils/block_parameters_holder.h
index 35543c3..ca36907 100644
--- a/libgav1/src/utils/block_parameters_holder.h
+++ b/libgav1/src/utils/block_parameters_holder.h
@@ -17,18 +17,18 @@
#ifndef LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
#define LIBGAV1_SRC_UTILS_BLOCK_PARAMETERS_HOLDER_H_
+#include <atomic>
#include <memory>
#include "src/utils/array_2d.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
-#include "src/utils/parameter_tree.h"
+#include "src/utils/dynamic_buffer.h"
#include "src/utils/types.h"
namespace libgav1 {
-// Holds a 2D array of |ParameterTree| objects. Each tree stores the parameters
-// corresponding to a superblock.
+// Holds the BlockParameters pointers for each 4x4 block in the frame.
class BlockParametersHolder {
public:
BlockParametersHolder() = default;
@@ -37,10 +37,13 @@
BlockParametersHolder(const BlockParametersHolder&) = delete;
BlockParametersHolder& operator=(const BlockParametersHolder&) = delete;
- // If |use_128x128_superblock| is true, 128x128 superblocks will be used,
- // otherwise 64x64 superblocks will be used.
- LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4,
- bool use_128x128_superblock);
+ LIBGAV1_MUST_USE_RESULT bool Reset(int rows4x4, int columns4x4);
+
+ // Returns a pointer to a BlockParameters object that can be used safely until
+ // the next call to Reset(). Returns nullptr on memory allocation failure. It
+ // also fills the cache matrix for the block starting at |row4x4|, |column4x4|
+ // of size |block_size| with the returned pointer.
+ BlockParameters* Get(int row4x4, int column4x4, BlockSize block_size);
// Finds the BlockParameters corresponding to |row4x4| and |column4x4|. This
// is done as a simple look up of the |block_parameters_cache_| matrix.
@@ -59,20 +62,24 @@
int columns4x4() const { return columns4x4_; }
- // Returns the ParameterTree corresponding to superblock starting at (|row|,
- // |column|).
- ParameterTree* Tree(int row, int column) { return trees_[row][column].get(); }
+ private:
+ // Needs access to FillCache for testing Cdef.
+ template <int bitdepth, typename Pixel>
+ friend class PostFilterApplyCdefTest;
- // Fills the cache matrix for the block starting at |row4x4|, |column4x4| of
- // size |block_size| with the pointer |bp|.
void FillCache(int row4x4, int column4x4, BlockSize block_size,
BlockParameters* bp);
- private:
int rows4x4_ = 0;
int columns4x4_ = 0;
- bool use_128x128_superblock_ = false;
- Array2D<std::unique_ptr<ParameterTree>> trees_;
+
+ // Owns the memory of BlockParameters pointers for the entire frame. It can
+ // hold up to |rows4x4_| * |columns4x4_| objects. Each object will be allocated
+ // on demand and re-used across frames.
+ DynamicBuffer<std::unique_ptr<BlockParameters>> block_parameters_;
+
+ // Holds the next available index into |block_parameters_|.
+ std::atomic<int> index_;
// This is a 2d array of size |rows4x4_| * |columns4x4_|. This is filled in by
// FillCache() and used by Find() to perform look ups using exactly one look
diff --git a/libgav1/src/utils/common.h b/libgav1/src/utils/common.h
index 8caad2e..2e599f0 100644
--- a/libgav1/src/utils/common.h
+++ b/libgav1/src/utils/common.h
@@ -30,12 +30,12 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
-#include <cstdlib>
#include <cstring>
#include <type_traits>
#include "src/utils/bit_mask_set.h"
#include "src/utils/constants.h"
+#include "src/utils/memory.h"
#include "src/utils/types.h"
namespace libgav1 {
@@ -58,6 +58,17 @@
return value < low ? low : (value > high ? high : value);
}
+template <typename Pixel>
+void ExtendLine(void* const line_start, const int width, const int left,
+ const int right) {
+ auto* const start = static_cast<Pixel*>(line_start);
+ const Pixel* src = start;
+ Pixel* dst = start - left;
+ // Copy to left and right borders.
+ Memset(dst, src[0], left);
+ Memset(dst + left + width, src[width - 1], right);
+}
+
// The following 2 templates set a block of data with uncontiguous memory to
// |value|. The compilers usually generate several branches to handle different
// cases of |columns| when inlining memset() and std::fill(), and these branches
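
ExtendLine assumes the caller has allocated |left| writable pixels before the row and |right| after it, since it writes through start - left. A hypothetical usage sketch:

// Border region allocated around an 8-pixel row (2 pixels on each side).
uint8_t buffer[2 + 8 + 2] = {};
uint8_t* const row = buffer + 2;
for (int i = 0; i < 8; ++i) row[i] = static_cast<uint8_t>(10 + i);
ExtendLine<uint8_t>(row, /*width=*/8, /*left=*/2, /*right=*/2);
// buffer now holds: 10 10 | 10 11 12 13 14 15 16 17 | 17 17.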
@@ -110,7 +121,7 @@
const unsigned char bit_set = _BitScanReverse(&first_set_bit, n);
assert(bit_set != 0);
static_cast<void>(bit_set);
- return 31 - static_cast<int>(first_set_bit);
+ return 31 ^ static_cast<int>(first_set_bit);
}
inline int CountLeadingZeros(uint64_t n) {
@@ -119,20 +130,20 @@
#if defined(HAVE_BITSCANREVERSE64)
const unsigned char bit_set =
_BitScanReverse64(&first_set_bit, static_cast<unsigned __int64>(n));
-#else // !defined(HAVE_BITSCANREVERSE64)
+#else // !defined(HAVE_BITSCANREVERSE64)
const auto n_hi = static_cast<unsigned long>(n >> 32); // NOLINT(runtime/int)
if (n_hi != 0) {
const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi);
assert(bit_set != 0);
static_cast<void>(bit_set);
- return 31 - static_cast<int>(first_set_bit);
+ return 31 ^ static_cast<int>(first_set_bit);
}
const unsigned char bit_set = _BitScanReverse(
&first_set_bit, static_cast<unsigned long>(n)); // NOLINT(runtime/int)
#endif // defined(HAVE_BITSCANREVERSE64)
assert(bit_set != 0);
static_cast<void>(bit_set);
- return 63 - static_cast<int>(first_set_bit);
+ return 63 ^ static_cast<int>(first_set_bit);
}
#undef HAVE_BITSCANREVERSE64
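
The 31 - x to 31 ^ x rewrites in this hunk (and in FloorLog2 below) are exact: for 0 <= x <= 31 the subtraction from an all-ones low-5-bit pattern never borrows, so subtraction and XOR coincide, and likewise 63 - x == 63 ^ x for 0 <= x <= 63. A one-loop check:

#include <cassert>

void CheckXorIdentity() {
  for (int x = 0; x <= 31; ++x) assert(31 - x == (31 ^ x));
  for (int x = 0; x <= 63; ++x) assert(63 - x == (63 ^ x));
}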
@@ -185,22 +196,22 @@
inline int FloorLog2(int32_t n) {
assert(n > 0);
- return 31 - CountLeadingZeros(static_cast<uint32_t>(n));
+ return 31 ^ CountLeadingZeros(static_cast<uint32_t>(n));
}
inline int FloorLog2(uint32_t n) {
assert(n > 0);
- return 31 - CountLeadingZeros(n);
+ return 31 ^ CountLeadingZeros(n);
}
inline int FloorLog2(int64_t n) {
assert(n > 0);
- return 63 - CountLeadingZeros(static_cast<uint64_t>(n));
+ return 63 ^ CountLeadingZeros(static_cast<uint64_t>(n));
}
inline int FloorLog2(uint64_t n) {
assert(n > 0);
- return 63 - CountLeadingZeros(n);
+ return 63 ^ CountLeadingZeros(n);
}
inline int CeilLog2(unsigned int n) {
@@ -211,8 +222,9 @@
return (n < 2) ? 0 : FloorLog2(n - 1) + 1;
}
-constexpr int Ceil(int dividend, int divisor) {
- return dividend / divisor + static_cast<int>(dividend % divisor != 0);
+inline int RightShiftWithCeiling(int value, int bits) {
+ assert(bits > 0);
+ return (value + (1 << bits) - 1) >> bits;
}
inline int32_t RightShiftWithRounding(int32_t value, int bits) {
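
RightShiftWithCeiling replaces the general-purpose Ceil with its power-of-two special case: adding (1 << bits) - 1 before the shift rounds any nonzero remainder up, so for non-negative values it equals Ceil(value, 1 << bits). Spot checks:

static_assert(((7 + (1 << 2) - 1) >> 2) == 2, "ceil(7 / 4) == 2");
static_assert(((8 + (1 << 2) - 1) >> 2) == 2, "ceil(8 / 4) == 2");
static_assert(((9 + (1 << 2) - 1) >> 2) == 3, "ceil(9 / 4) == 3");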
@@ -363,7 +375,7 @@
// behavior and result apply to other CPUs' SIMD instructions.
inline int GetRelativeDistance(const unsigned int a, const unsigned int b,
const unsigned int order_hint_shift_bits) {
- const int diff = a - b;
+ const int diff = static_cast<int>(a) - static_cast<int>(b);
assert(order_hint_shift_bits <= 31);
if (order_hint_shift_bits == 0) {
assert(a == 0);
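
Casting both operands before subtracting keeps the difference in signed arithmetic; with the old a - b the subtraction happened in unsigned, and converting an out-of-range result to int was implementation-defined before C++20. The rest of the function lies outside this hunk; as a hedged sketch (a hypothetical helper, not the libgav1 body), one standard way to fold a |bits|-bit order-hint difference into the signed window [-2^(bits-1), 2^(bits-1)) is:

int SignedWindow(unsigned a, unsigned b, int bits) {
  const unsigned mask = (1u << bits) - 1;
  const unsigned diff = (a - b) & mask;    // modular difference.
  const unsigned sign = 1u << (bits - 1);
  return static_cast<int>(diff ^ sign) - static_cast<int>(sign);
}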
@@ -510,6 +522,8 @@
return filter_index;
}
+// This has identical results to RightShiftWithRounding since |subsampling| can
+// only be 0 or 1.
constexpr int SubsampledValue(int value, int subsampling) {
return (value + subsampling) >> subsampling;
}
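
A spot check for the new comment: with |subsampling| restricted to 0 or 1, (value + subsampling) >> subsampling is exactly round-half-up shifting. The helper below mirrors RightShiftWithRounding's add-half-then-shift form, assuming SubsampledValue is in scope:

constexpr int RoundShift(int v, int bits) {
  return (v + ((1 << bits) >> 1)) >> bits;
}
static_assert(SubsampledValue(11, 0) == RoundShift(11, 0), "");
static_assert(SubsampledValue(11, 1) == RoundShift(11, 1), "");
static_assert(SubsampledValue(12, 1) == RoundShift(12, 1), "");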
diff --git a/libgav1/src/utils/constants.cc b/libgav1/src/utils/constants.cc
index 97959fa..80d7acb 100644
--- a/libgav1/src/utils/constants.cc
+++ b/libgav1/src/utils/constants.cc
@@ -871,311 +871,4 @@
const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes] = {
{0, 1}, {2, 2}, {3, 3}};
-const int8_t kMaskIdLookup[4][kMaxBlockSizes] = {
- // transform size 4x4.
- {0, 1, 13, 2, 3, 4, 15, 14, 5, 6, 7,
- 17, 16, 8, 9, 10, 18, 11, 12, -1, -1, -1},
- // transform size 8x8.
- {-1, -1, -1, -1, 19, 20, 29, -1, 21, 22, 23,
- 31, 30, 24, 25, 26, 32, 27, 28, -1, -1, -1},
- // transform size 16x16.
- {-1, -1, -1, -1, -1, -1, -1, -1, -1, 33, 34,
- 40, -1, 35, 36, 37, 41, 38, 39, -1, -1, -1},
- // transform size 32x32.
- {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, 42, 43, -1, 44, 45, -1, -1, -1},
-};
-
-const int8_t kVerticalBorderMaskIdLookup[kMaxBlockSizes] = {
- 0, 47, 61, 49, 19, 51, 63, 62, 53, 33, 55,
- 65, 64, 57, 42, 59, 66, 60, 46, -1, -1, -1};
-
-const uint64_t kTopMaskLookup[67][4] = {
- // transform size 4X4
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X4, transform size 4X4
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X8, transform size 4X4
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X4, transform size 4X4
- {0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X8, transform size 4X4
- {0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 4X4
- {0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 4X4
- {0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 4X4
- {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
- 0x00ff00ff00ff00ffULL}, // block size 32X64, transform size 4X4
- {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 4X4
- {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
- 0xffffffffffffffffULL}, // block size 64X64, transform size 4x4
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X4
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 4X4
- {0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 4X4
- {0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 4X4
- {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
- 0x000f000f000f000fULL}, // block size 16X64, transform size 4X4
- {0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 4X4
- // transform size 8X8
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X8, transform size 8X8
- {0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 8X8
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 8X8
- {0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 8X8
- {0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 8X8
- {0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 8X8
- {0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 8X8
- {0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
- 0x000000ff000000ffULL}, // block size 32X64, transform size 8X8
- {0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 8X8
- {0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
- 0x0000ffff0000ffffULL}, // block size 64X64, transform size 8X8
- {0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X8
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 8X8
- {0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
- 0x0000000f0000000fULL}, // block size 16X64, transform size 8X8
- {0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 8X8
- // transform size 16X16
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 16X16
- {0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 16X16
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 16X16
- {0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 16X16
- {0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
- 0x00000000000000ffULL}, // block size 32X64, transform size 16X16
- {0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 16X16
- {0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
- 0x000000000000ffffULL}, // block size 64X64, transform size 16X16
- {0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
- 0x000000000000000fULL}, // block size 16X64, transform size 16X16
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 16X16
- // transform size 32X32
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 32X32
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
- 0x0000000000000000ULL}, // block size 32X64, transform size 32X32
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 32X32
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
- 0x0000000000000000ULL}, // block size 64X64, transform size 32X32
- // transform size 64X64
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X64, transform size 64X64
- // 2:1, 1:2 transform sizes.
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X8, transform size 4X8
- {0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X8
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X4, transform size 8X4
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 8X4
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 8X16
- {0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X16
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 16X8
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 16X8
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 16X32
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
- 0x0000000000000000ULL}, // block size 16X64, transform size 16X32
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 32X16
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 32X16
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X64, transform size 32X64
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 64X32
- // 4:1, 1:4 transform sizes.
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X16
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 16X4
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X32
- {0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 32X8
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X64, transform size 16X64
- {0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 64X16
-};
-
-const uint64_t kLeftMaskLookup[67][4] = {
- // transform size 4X4
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X4, transform size 4X4
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X8, transform size 4X4
- {0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X4, transform size 4X4
- {0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X8, transform size 4X4
- {0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 4X4
- {0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 4X4
- {0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 4X4
- {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 4X4
- {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
- 0x00ff00ff00ff00ffULL}, // block size 32X64, transform size 4X4
- {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 4X4
- {0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
- 0xffffffffffffffffULL}, // block size 64X64, transform size 4X4
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X4
- {0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 4X4
- {0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 4X4
- {0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 4X4
- {0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
- 0x000f000f000f000fULL}, // block size 16X64, transform size 4X4
- {0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 4X4
- // transform size 8X8
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X8, transform size 8X8
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 8X8
- {0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 8X8
- {0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 8X8
- {0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 8X8
- {0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 8X8
- {0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 8X8
- {0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
- 0x0055005500550055ULL}, // block size 32X64, transform size 8X8
- {0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 8X8
- {0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
- 0x5555555555555555ULL}, // block size 64X64, transform size 8X8
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X8
- {0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 8X8
- {0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
- 0x0005000500050005ULL}, // block size 16X64, transform size 8X8
- {0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 8X8
- // transform size 16X16
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X16, transform size 16X16
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 16X16
- {0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 16X16
- {0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 16X16
- {0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
- 0x0011001100110011ULL}, // block size 32X64, transform size 16X16
- {0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 16X16
- {0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
- 0x1111111111111111ULL}, // block size 64X64, transform size 16X16
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL}, // block size 16X64, transform size 16X16
- {0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 16X16
- // transform size 32X32
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X32, transform size 32X32
- {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
- 0x0101010101010101ULL}, // block size 32X64, transform size 32X32
- {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 32X32
- {0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
- 0x0101010101010101ULL}, // block size 64X64, transform size 32X32
- // transform size 64X64
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL}, // block size 64X64, transform size 64X64
- // 2:1, 1:2 transform sizes.
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X8, transform size 4X8
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X8
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X4, transform size 8X4
- {0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 8X4
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X16, transform size 8X16
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X16
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X8, transform size 16X8
- {0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 16X8
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X32, transform size 16X32
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL}, // block size 16X64, transform size 16X32
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X16, transform size 32X16
- {0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 32X16
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL}, // block size 32X64, transform size 32X64
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X32, transform size 64X32
- // 4:1, 1:4 transform sizes.
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 4X16, transform size 4X16
- {0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 16X4, transform size 16X4
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 8X32, transform size 8X32
- {0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 32X8, transform size 32X8
- {0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
- 0x0001000100010001ULL}, // block size 16X64, transform size 16X64
- {0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
- 0x0000000000000000ULL}, // block size 64X16, transform size 64X16
-};
-
} // namespace libgav1
diff --git a/libgav1/src/utils/constants.h b/libgav1/src/utils/constants.h
index ce987b4..a2076c5 100644
--- a/libgav1/src/utils/constants.h
+++ b/libgav1/src/utils/constants.h
@@ -44,6 +44,8 @@
kMinQuantizer = 0,
kMinLossyQuantizer = 1,
kMaxQuantizer = 255,
+ // Quantizer matrix is used only when level < 15.
+ kNumQuantizerLevelsForQuantizerMatrix = 15,
kFrameLfCount = 4,
kMaxLoopFilterValue = 63,
kNum4x4In64x64 = 256,
@@ -106,6 +108,7 @@
kMaxScaledSuperBlockSizeInPixels = 128 * 2,
kMaxSuperBlockSizeSquareInPixels = 128 * 128,
kNum4x4InLoopFilterUnit = 16,
+ kNum4x4InLoopRestorationUnit = 16,
kProjectionMvClamp = (1 << 14) - 1, // == 16383
kProjectionMvMaxHorizontalOffset = 8,
kCdefUnitSize = 64,
@@ -124,11 +127,12 @@
kSuperResScaleBits = 14,
kSuperResExtraBits = kSuperResScaleBits - kSuperResFilterBits,
kSuperResScaleMask = (1 << 14) - 1,
- kSuperResHorizontalBorder = 8,
+ kSuperResHorizontalBorder = 4,
kSuperResVerticalBorder = 1,
- // The SIMD implementations of superres calculate up to 4 extra upscaled
- // pixels which will over-read 2 downscaled pixels in the end of each row.
- kSuperResHorizontalPadding = 2,
+ // The SIMD implementations of superres calculate up to 15 extra upscaled
+ // pixels which will over-read up to 15 downscaled pixels at the end of each
+ // row. Set the padding to 16 for alignment purposes.
+ kSuperResHorizontalPadding = 16,
// TODO(chengchen): consider merging these constants:
// kFilterBits, kWienerFilterBits, and kSgrProjPrecisionBits, which are all 7,
// They are designed to match AV1 convolution, which increases coeff
@@ -625,6 +629,52 @@
abort();
}
+inline const char* ToString(const TransformSize size) {
+ switch (size) {
+ case kTransformSize4x4:
+ return "kTransformSize4x4";
+ case kTransformSize4x8:
+ return "kTransformSize4x8";
+ case kTransformSize4x16:
+ return "kTransformSize4x16";
+ case kTransformSize8x4:
+ return "kTransformSize8x4";
+ case kTransformSize8x8:
+ return "kTransformSize8x8";
+ case kTransformSize8x16:
+ return "kTransformSize8x16";
+ case kTransformSize8x32:
+ return "kTransformSize8x32";
+ case kTransformSize16x4:
+ return "kTransformSize16x4";
+ case kTransformSize16x8:
+ return "kTransformSize16x8";
+ case kTransformSize16x16:
+ return "kTransformSize16x16";
+ case kTransformSize16x32:
+ return "kTransformSize16x32";
+ case kTransformSize16x64:
+ return "kTransformSize16x64";
+ case kTransformSize32x8:
+ return "kTransformSize32x8";
+ case kTransformSize32x16:
+ return "kTransformSize32x16";
+ case kTransformSize32x32:
+ return "kTransformSize32x32";
+ case kTransformSize32x64:
+ return "kTransformSize32x64";
+ case kTransformSize64x16:
+ return "kTransformSize64x16";
+ case kTransformSize64x32:
+ return "kTransformSize64x32";
+ case kTransformSize64x64:
+ return "kTransformSize64x64";
+ case kNumTransformSizes:
+ return "kNumTransformSizes";
+ }
+ abort();
+}
+
inline const char* ToString(const TransformType type) {
switch (type) {
case kTransformTypeDctDct:
@@ -735,14 +785,6 @@
extern const uint8_t kDeblockFilterLevelIndex[kMaxPlanes][kNumLoopFilterTypes];
-extern const int8_t kMaskIdLookup[4][kMaxBlockSizes];
-
-extern const int8_t kVerticalBorderMaskIdLookup[kMaxBlockSizes];
-
-extern const uint64_t kTopMaskLookup[67][4];
-
-extern const uint64_t kLeftMaskLookup[67][4];
-
} // namespace libgav1
#endif // LIBGAV1_SRC_UTILS_CONSTANTS_H_
diff --git a/libgav1/src/utils/cpu.cc b/libgav1/src/utils/cpu.cc
index a6b7057..b3c51da 100644
--- a/libgav1/src/utils/cpu.cc
+++ b/libgav1/src/utils/cpu.cc
@@ -39,7 +39,7 @@
__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(ecx));
return (static_cast<uint64_t>(edx) << 32) | eax;
}
-#else // _MSC_VER
+#else // _MSC_VER
void CpuId(int leaf, uint32_t info[4]) {
__cpuidex(reinterpret_cast<int*>(info), leaf, 0 /*ecx=subleaf*/);
}
diff --git a/libgav1/src/utils/cpu.h b/libgav1/src/utils/cpu.h
index d098f1d..aefc2df 100644
--- a/libgav1/src/utils/cpu.h
+++ b/libgav1/src/utils/cpu.h
@@ -21,19 +21,58 @@
namespace libgav1 {
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(__i386__) || defined(__x86_64__)
+#define LIBGAV1_X86
+#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+#define LIBGAV1_X86
#define LIBGAV1_X86_MSVC
#endif
-#if !defined(LIBGAV1_ENABLE_SSE4_1)
-#if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
-#define LIBGAV1_ENABLE_SSE4_1 1
-#else
-#define LIBGAV1_ENABLE_SSE4_1 0
-#endif
-#endif // !defined(LIBGAV1_ENABLE_SSE4_1)
+#if defined(LIBGAV1_X86)
-#undef LIBGAV1_X86_MSVC
+#if !defined(LIBGAV1_ENABLE_SSE4_1)
+#define LIBGAV1_ENABLE_SSE4_1 1
+#endif
+
+#if LIBGAV1_ENABLE_SSE4_1
+#if !defined(LIBGAV1_ENABLE_AVX2)
+#define LIBGAV1_ENABLE_AVX2 1
+#endif // !defined(LIBGAV1_ENABLE_AVX2)
+#else // !LIBGAV1_ENABLE_SSE4_1
+// Disable AVX2 when SSE4.1 is disabled as it may rely on shared components.
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#endif // LIBGAV1_ENABLE_SSE4_1
+
+#else // !LIBGAV1_X86
+
+#undef LIBGAV1_ENABLE_AVX2
+#define LIBGAV1_ENABLE_AVX2 0
+#undef LIBGAV1_ENABLE_SSE4_1
+#define LIBGAV1_ENABLE_SSE4_1 0
+
+#endif // LIBGAV1_X86
+
+// For x86 LIBGAV1_TARGETING_* indicate the source being built is targeting
+// (at least) that instruction set. This prevents disabling other instruction
+// sets if the current instruction set isn't a global target, e.g., building
+// *_avx2.cc with -mavx2 while the remaining files are built without the flag.
+#if LIBGAV1_ENABLE_AVX2 && defined(__AVX2__)
+#define LIBGAV1_TARGETING_AVX2 1
+#else
+#define LIBGAV1_TARGETING_AVX2 0
+#endif
+
+// Note: LIBGAV1_X86_MSVC isn't completely correct for Visual Studio, but there
+// is no equivalent to __SSE4_1__. LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS will be
+// enabled in dsp.h to compensate for this.
+#if LIBGAV1_ENABLE_SSE4_1 && (defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC))
+#define LIBGAV1_TARGETING_SSE4_1 1
+#else
+#define LIBGAV1_TARGETING_SSE4_1 0
+#endif
+
+#undef LIBGAV1_X86
#if !defined(LIBGAV1_ENABLE_NEON)
// TODO(jzern): add support for _M_ARM64.
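
After this restructuring, LIBGAV1_ENABLE_* answers "may this instruction set be used at all in this build", while LIBGAV1_TARGETING_* answers "is this particular translation unit compiled with that instruction set's codegen", so a per-ISA source file can enable its body without forcing the flag onto the rest of the library. A sketch of the guard pattern this supports (file name hypothetical):

// convolve_avx2.cc -- compiled into every build, but only this file is given
// -mavx2, so only here does LIBGAV1_TARGETING_AVX2 evaluate to 1.
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_AVX2
// ... AVX2 intrinsics are safe: this translation unit targets AVX2 ...
#endif  // LIBGAV1_TARGETING_AVX2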
diff --git a/libgav1/src/utils/dynamic_buffer.h b/libgav1/src/utils/dynamic_buffer.h
index 5e2f644..40ece26 100644
--- a/libgav1/src/utils/dynamic_buffer.h
+++ b/libgav1/src/utils/dynamic_buffer.h
@@ -28,6 +28,7 @@
class DynamicBuffer {
public:
T* get() { return buffer_.get(); }
+ const T* get() const { return buffer_.get(); }
// Resizes the buffer so that it can hold at least |size| elements. Existing
// contents will be destroyed when resizing to a larger size.
@@ -45,6 +46,8 @@
return true;
}
+ size_t size() const { return size_; }
+
private:
std::unique_ptr<T[]> buffer_;
size_t size_ = 0;
diff --git a/libgav1/src/utils/entropy_decoder.cc b/libgav1/src/utils/entropy_decoder.cc
index dfe3bba..bf21199 100644
--- a/libgav1/src/utils/entropy_decoder.cc
+++ b/libgav1/src/utils/entropy_decoder.cc
@@ -20,6 +20,7 @@
#include "src/utils/common.h"
#include "src/utils/compiler_attributes.h"
#include "src/utils/constants.h"
+#include "src/utils/cpu.h"
#if defined(__ARM_NEON__) || defined(__aarch64__) || \
(defined(_MSC_VER) && defined(_M_ARM))
@@ -32,24 +33,20 @@
#include <arm_neon.h>
#endif
-#if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
-#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 1
+#if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
#else
-#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 0
+#define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
#endif
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
-#include <smmintrin.h>
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+#include <emmintrin.h>
#endif
namespace libgav1 {
namespace {
constexpr uint32_t kReadBitMask = ~255;
-// This constant is used to set the value of |bits_| so that bits can be read
-// after end of stream without trying to refill the buffer for a reasonably long
-// time.
-constexpr int kLargeBitCount = 0x4000;
constexpr int kCdfPrecision = 6;
constexpr int kMinimumProbabilityPerSymbol = 4;
@@ -78,10 +75,12 @@
// count >> 4 is 2 for count == 31.
// Now, the equation becomes:
// 4 + (count >> 4) + (symbol_count > 3).
- // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
- // with bitwise or. So the final equation is:
- // (4 | (count >> 4)) + (symbol_count > 3).
- const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count > 3);
+ // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
+ // with bitwise or:
+ // (4 | (count >> 4)) + (symbol_count > 3).
+ // but using addition will allow the compiler to eliminate an operation when
+ // symbol_count is known and this function is inlined.
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
// Hints for further optimizations:
//
// 1. clang can vectorize this for loop with width 4, even though the loop
@@ -103,13 +102,15 @@
// signed integer and right-shifted. This requires the right shift of a
// signed integer be an arithmetic shift, which is true for clang, gcc, and
// Visual C++.
- for (int i = 0; i < symbol_count - 1; ++i) {
+ assert(symbol_count - 1 > 0);
+ int i = 0;
+ do {
if (i < symbol) {
cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
} else {
cdf[i] -= cdf[i] >> rate;
}
- }
+ } while (++i < symbol_count - 1);
cdf[symbol_count] += static_cast<uint16_t>(count < 32);
}
@@ -146,8 +147,9 @@
// cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
// }
//
-// The following ARM NEON implementations use the second form, which seems
-// slightly faster.
+// The following ARM NEON implementations use a modified version of the first
+// form, using the comparison mask and unsigned rollover to avoid the need to
+// calculate rounding.
//
// The cdf array has symbol_count + 1 elements. The first symbol_count elements
// are the CDF. The last element is a count that is initialized to 0 and may
@@ -169,42 +171,47 @@
void UpdateCdf5(uint16_t* const cdf, const int symbol) {
uint16x4_t cdf_vec = vld1_u16(cdf);
const uint16_t count = cdf[5];
- const int rate = (4 | (count >> 4)) + 1;
- const uint16x4_t zero = vdup_n_u16(0);
- const uint16x4_t cdf_max_probability =
- vdup_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+ const int rate = (count >> 4) + 5;
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
const uint16x4_t index = vcreate_u16(0x0003000200010000);
const uint16x4_t symbol_vec = vdup_n_u16(symbol);
- const uint16x4_t mask = vclt_u16(index, symbol_vec);
- const uint16x4_t a = vbsl_u16(mask, cdf_max_probability, zero);
- const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(cdf_vec, a));
+ const uint16x4_t mask = vcge_u16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
const int16x4_t negative_rate = vdup_n_s16(-rate);
+ // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
- cdf_vec = vsub_u16(cdf_vec, delta);
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = vadd_u16(cdf_offset, delta);
vst1_u16(cdf, cdf_vec);
cdf[5] = count + static_cast<uint16_t>(count < 32);
}
// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
template <int symbol_count>
void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
static_assert(symbol_count >= 7 && symbol_count <= 9, "");
uint16x8_t cdf_vec = vld1q_u16(cdf);
const uint16_t count = cdf[symbol_count];
- const int rate = (4 | (count >> 4)) + 1;
- const uint16x8_t zero = vdupq_n_u16(0);
- const uint16x8_t cdf_max_probability =
- vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
vcreate_u16(0x0007000600050004));
const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
- const uint16x8_t mask = vcltq_u16(index, symbol_vec);
- const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
- const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
const int16x8_t negative_rate = vdupq_n_s16(-rate);
const uint16x8_t delta =
vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec = vsubq_u16(cdf_vec, delta);
+ cdf_vec = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf, cdf_vec);
cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
}
@@ -217,27 +224,31 @@
UpdateCdf7To9<8>(cdf, symbol);
}
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
void UpdateCdf11(uint16_t* const cdf, const int symbol) {
uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
const uint16_t count = cdf[11];
cdf[11] = count + static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
+ const int rate = (count >> 4) + 5;
if (symbol > 1) {
cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
- const uint16x8_t zero = vdupq_n_u16(0);
- const uint16x8_t cdf_max_probability =
- vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
const int16x8_t negative_rate = vdupq_n_s16(-rate);
const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
vcreate_u16(0x0009000800070006));
- const uint16x8_t mask = vcltq_u16(index, symbol_vec);
- const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
- const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+ const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
const uint16x8_t delta =
vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec = vsubq_u16(cdf_vec, delta);
+ cdf_vec = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf + 2, cdf_vec);
} else {
if (symbol != 0) {
@@ -254,65 +265,67 @@
}
}
+// See UpdateCdf5 for implementation details.
void UpdateCdf13(uint16_t* const cdf, const int symbol) {
uint16x8_t cdf_vec0 = vld1q_u16(cdf);
uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
const uint16_t count = cdf[13];
- const int rate = (4 | (count >> 4)) + 1;
- const uint16x8_t zero = vdupq_n_u16(0);
- const uint16x8_t cdf_max_probability =
- vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
const int16x8_t negative_rate = vdupq_n_s16(-rate);
uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
vcreate_u16(0x0007000600050004));
- uint16x8_t mask = vcltq_u16(index, symbol_vec);
- uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
- int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec0, a));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec0 = vsubq_u16(cdf_vec0, delta);
+ cdf_vec0 = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf, cdf_vec0);
index = vcombine_u16(vcreate_u16(0x0007000600050004),
vcreate_u16(0x000b000a00090008));
- mask = vcltq_u16(index, symbol_vec);
- a = vbslq_u16(mask, cdf_max_probability, zero);
- diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec1, a));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
+ cdf_offset = vsubq_u16(cdf_vec1, mask);
delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec1 = vsubq_u16(cdf_vec1, delta);
+ cdf_vec1 = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf + 4, cdf_vec1);
cdf[13] = count + static_cast<uint16_t>(count < 32);
}
+// See UpdateCdf5 for implementation details.
void UpdateCdf16(uint16_t* const cdf, const int symbol) {
uint16x8_t cdf_vec = vld1q_u16(cdf);
const uint16_t count = cdf[16];
- const int rate = (4 | (count >> 4)) + 1;
- const uint16x8_t zero = vdupq_n_u16(0);
- const uint16x8_t cdf_max_probability =
- vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
+ const int rate = (count >> 4) + 5;
+ const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
const int16x8_t negative_rate = vdupq_n_s16(-rate);
uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
vcreate_u16(0x0007000600050004));
- uint16x8_t mask = vcltq_u16(index, symbol_vec);
- uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
- int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+ uint16x8_t mask = vcgeq_u16(index, symbol_vec);
+ uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec = vsubq_u16(cdf_vec, delta);
+ cdf_vec = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf, cdf_vec);
cdf_vec = vld1q_u16(cdf + 8);
index = vcombine_u16(vcreate_u16(0x000b000a00090008),
vcreate_u16(0x000f000e000d000c));
- mask = vcltq_u16(index, symbol_vec);
- a = vbslq_u16(mask, cdf_max_probability, zero);
- diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
+ mask = vcgeq_u16(index, symbol_vec);
+ a = vorrq_u16(mask, cdf_max_probability);
+ diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
+ cdf_offset = vsubq_u16(cdf_vec, mask);
delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
- cdf_vec = vsubq_u16(cdf_vec, delta);
+ cdf_vec = vaddq_u16(cdf_offset, delta);
vst1q_u16(cdf + 8, cdf_vec);
cdf[16] = count + static_cast<uint16_t>(count < 32);
@@ -320,7 +333,7 @@
#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
inline __m128i LoadLo8(const void* a) {
return _mm_loadl_epi64(static_cast<const __m128i*>(a));
@@ -341,39 +354,47 @@
void UpdateCdf5(uint16_t* const cdf, const int symbol) {
__m128i cdf_vec = LoadLo8(cdf);
const uint16_t count = cdf[5];
- const int rate = (4 | (count >> 4)) + 1;
- const __m128i zero = _mm_setzero_si128();
- const __m128i cdf_max_probability = _mm_shufflelo_epi16(
- _mm_cvtsi32_si128(kCdfMaxProbability + 1 - (1 << rate)), 0);
- const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00030002, 0x00010000);
+ const int rate = (count >> 4) + 5;
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
- const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
- const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
- const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+ // i >= symbol.
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ // i < symbol: 32768, i >= symbol: 65535.
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
+  // i < symbol: (32768 - cdf) >> rate.
+  // i >= symbol: (65535 - cdf) >> rate, where 65535 reads as -1 in int16.
const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
+ // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
StoreLo8(cdf, cdf_vec);
cdf[5] = count + static_cast<uint16_t>(count < 32);
}
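
The step-by-step comments above trace the branchless mask trick per lane. A scalar sketch of the same arithmetic, standalone and with an illustrative helper name (|i| is the 0-based cdf position; the SSE2 code encodes it as a 1-based index compared with cmpgt, while the NEON code compares actual positions with cge):

    #include <cassert>
    #include <cstdint>

    constexpr uint16_t kCdfMaxProbability = 1 << 15;  // 32768.

    // One lane of the vectorized CDF update.
    uint16_t UpdateLane(uint16_t cdf, int i, int symbol, int rate) {
      // 0x0000 when i < symbol, 0xffff (-1 as int16) when i >= symbol.
      const uint16_t mask = (i >= symbol) ? 0xffffu : 0u;
      const uint16_t a = mask | kCdfMaxProbability;           // 32768 or 65535.
      const auto diff = static_cast<int16_t>(a - cdf);        // modular subtract.
      const uint16_t cdf_offset = cdf - mask;                 // cdf or cdf + 1.
      const auto delta = static_cast<int16_t>(diff >> rate);  // arithmetic shift.
      return cdf_offset + delta;
    }

    int main() {
      // Matches the branchy form used by the generic C UpdateCdf().
      for (int cdf = 1; cdf < kCdfMaxProbability; ++cdf) {
        for (int rate = 4; rate <= 7; ++rate) {
          const int up = cdf + ((kCdfMaxProbability - cdf) >> rate);
          const int down = cdf - (cdf >> rate);
          assert(UpdateLane(cdf, /*i=*/0, /*symbol=*/1, rate) == up);
          assert(UpdateLane(cdf, /*i=*/1, /*symbol=*/1, rate) == down);
        }
      }
      return 0;
    }
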
// This version works for |symbol_count| = 7, 8, or 9.
+// See UpdateCdf5 for implementation details.
template <int symbol_count>
void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
static_assert(symbol_count >= 7 && symbol_count <= 9, "");
__m128i cdf_vec = LoadUnaligned16(cdf);
const uint16_t count = cdf[symbol_count];
- const int rate = (4 | (count >> 4)) + 1;
- const __m128i zero = _mm_setzero_si128();
+ const int rate = (count >> 4) + 5;
const __m128i cdf_max_probability =
- _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
const __m128i index =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i symbol_vec = _mm_set1_epi16(symbol);
- const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
- const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
- const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
StoreUnaligned16(cdf, cdf_vec);
cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
}
@@ -386,25 +407,30 @@
UpdateCdf7To9<8>(cdf, symbol);
}
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf7To9<9>(cdf, symbol);
+}
+
+// See UpdateCdf5 for implementation details.
void UpdateCdf11(uint16_t* const cdf, const int symbol) {
__m128i cdf_vec = LoadUnaligned16(cdf + 2);
const uint16_t count = cdf[11];
cdf[11] = count + static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
+ const int rate = (count >> 4) + 5;
if (symbol > 1) {
cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
- const __m128i zero = _mm_setzero_si128();
const __m128i cdf_max_probability =
- _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
const __m128i index =
- _mm_set_epi32(0x00090008, 0x00070006, 0x00050004, 0x00030002);
- const __m128i symbol_vec = _mm_set1_epi16(symbol);
- const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
- const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
- const __m128i diff = _mm_sub_epi16(cdf_vec, a);
+ _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ cdf_vec = _mm_add_epi16(cdf_offset, delta);
StoreUnaligned16(cdf + 2, cdf_vec);
} else {
if (symbol != 0) {
@@ -420,32 +446,33 @@
}
}
+// See UpdateCdf5 for implementation details.
void UpdateCdf13(uint16_t* const cdf, const int symbol) {
- __m128i cdf_vec0 = LoadUnaligned16(cdf);
+ __m128i cdf_vec0 = LoadLo8(cdf);
__m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
const uint16_t count = cdf[13];
- const int rate = (4 | (count >> 4)) + 1;
- const __m128i zero = _mm_setzero_si128();
+ const int rate = (count >> 4) + 5;
const __m128i cdf_max_probability =
- _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
- const __m128i symbol_vec = _mm_set1_epi16(symbol);
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
- const __m128i index =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
- const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
- const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
+ const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
- StoreUnaligned16(cdf, cdf_vec0);
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
+ StoreLo8(cdf, cdf_vec0);
const __m128i index1 =
- _mm_set_epi32(0x000b000a, 0x00090008, 0x00070006, 0x00050004);
- const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
- const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
- const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
+ _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
- cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
StoreUnaligned16(cdf + 4, cdf_vec1);
cdf[13] = count + static_cast<uint16_t>(count < 32);
@@ -454,35 +481,36 @@
void UpdateCdf16(uint16_t* const cdf, const int symbol) {
__m128i cdf_vec0 = LoadUnaligned16(cdf);
const uint16_t count = cdf[16];
- const int rate = (4 | (count >> 4)) + 1;
- const __m128i zero = _mm_setzero_si128();
+ const int rate = (count >> 4) + 5;
const __m128i cdf_max_probability =
- _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
- const __m128i symbol_vec = _mm_set1_epi16(symbol);
+ _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
+ const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
const __m128i index =
- _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
- const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
- const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
- const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
+ _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
+ const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
+ const __m128i a = _mm_or_si128(mask, cdf_max_probability);
+ const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
+ const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
+ cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
StoreUnaligned16(cdf, cdf_vec0);
__m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
const __m128i index1 =
- _mm_set_epi32(0x000f000e, 0x000d000c, 0x000b000a, 0x00090008);
- const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
- const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
- const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
+ _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
+ const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
+ const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
+ const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
+ const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
- cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
+ cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
StoreUnaligned16(cdf + 8, cdf_vec1);
cdf[16] = count + static_cast<uint16_t>(count < 32);
}
-#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
void UpdateCdf5(uint16_t* const cdf, const int symbol) {
UpdateCdf(cdf, 5, symbol);
@@ -496,6 +524,10 @@
UpdateCdf(cdf, 8, symbol);
}
+void UpdateCdf9(uint16_t* const cdf, const int symbol) {
+ UpdateCdf(cdf, 9, symbol);
+}
+
void UpdateCdf11(uint16_t* const cdf, const int symbol) {
UpdateCdf(cdf, 11, symbol);
}
@@ -508,9 +540,28 @@
UpdateCdf(cdf, 16, symbol);
}
-#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
+#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
#endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+inline DaalaBitReader::WindowSize HostToBigEndian(
+ const DaalaBitReader::WindowSize x) {
+ static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
+#if defined(__GNUC__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
+#else
+ return x;
+#endif
+#elif defined(_WIN32)
+ // Note Windows targets are assumed to be little endian.
+ return static_cast<DaalaBitReader::WindowSize>(
+ (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
+ : _byteswap_ulong(static_cast<unsigned long>(x)));
+#else
+#error Unknown compiler!
+#endif // defined(__GNUC__)
+}
+
} // namespace
#if !LIBGAV1_CXX17
@@ -520,11 +571,33 @@
DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
bool allow_update_cdf)
: data_(data),
- size_(size),
- data_index_(0),
- allow_update_cdf_(allow_update_cdf) {
- window_diff_ = (WindowSize{1} << (kWindowSize - 1)) - 1;
- values_in_range_ = kCdfMaxProbability;
+ data_end_(data + size),
+ data_memcpy_end_((size >= sizeof(WindowSize))
+ ? data + size - sizeof(WindowSize) + 1
+ : data),
+ allow_update_cdf_(allow_update_cdf),
+ values_in_range_(kCdfMaxProbability) {
+ if (data_ < data_memcpy_end_) {
+ // This is a simplified version of PopulateBits() which loads 8 extra bits
+ // and skips the unnecessary shifts of value and window_diff_.
+ WindowSize value;
+ memcpy(&value, data_, sizeof(value));
+ data_ += sizeof(value);
+ window_diff_ = HostToBigEndian(value) ^ -1;
+ // Note the initial value of bits_ is larger than kMaxCachedBits as it's
+ // used to restore the most significant 0 bit that would be present after
+ // PopulateBits() when we extract the first symbol value.
+ // As shown in Section 8.2.2 Initialization process for symbol decoder,
+ // which uses a fixed offset to read the symbol values, the most
+ // significant bit is always 0:
+ // The variable numBits is set equal to Min( sz * 8, 15).
+ // The variable buf is read using the f(numBits) parsing process.
+ // The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
+ // The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
+ bits_ = kWindowSize - 15;
+ return;
+ }
+ window_diff_ = 0;
bits_ = -15;
PopulateBits();
}
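
The quoted Section 8.2.2 steps can be checked against this initialization in a toy program. A standalone sketch, assuming a GCC/Clang host and a 64-bit WindowSize, with arbitrary input bytes:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const uint8_t data[8] = {0x5a, 0xc3, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06};
      // Spec: numBits = Min(sz * 8, 15) = 15, buf = first 15 bits of the
      // stream, paddedBuf = buf << 0, SymbolValue = ((1 << 15) - 1) ^ buf.
      const int buf = (data[0] << 7) | (data[1] >> 1);
      const int spec_symbol_value = ((1 << 15) - 1) ^ buf;
      // Reader: load 8 bytes most-significant-first, invert, bits_ = 64 - 15.
      uint64_t value;
      std::memcpy(&value, data, sizeof(value));
    #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      value = __builtin_bswap64(value);
    #endif
      const uint64_t window_diff = value ^ ~uint64_t{0};
      const int bits = 64 - 15;
      // The 15 bits above |bits| are the spec's SymbolValue; the 16th bit the
      // uint16_t cast would include is the always-0 bit described above.
      assert(static_cast<int>(window_diff >> bits) == spec_symbol_value);
      return 0;
    }
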
@@ -537,12 +610,11 @@
int DaalaBitReader::ReadBit() {
const uint32_t curr =
((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
- const WindowSize zero_threshold = static_cast<WindowSize>(curr)
- << (kWindowSize - 16);
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
int bit = 1;
- if (window_diff_ >= zero_threshold) {
+ if (symbol_value >= curr) {
values_in_range_ -= curr;
- window_diff_ -= zero_threshold;
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
bit = 0;
} else {
values_in_range_ = curr;
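
In other words, the coded value is now read from the 16-bit slice of window_diff_ that starts at bit bits_, instead of being pinned to the top of the window. A minimal standalone sketch of the boolean decode step (the names and the 64-bit window are illustrative):

    #include <cstdint>

    // Compare the active 16-bit slice of the window against the split point
    // |curr|, then shrink the range accordingly.
    inline bool DecodeBool(uint64_t* window_diff, int bits, uint32_t* range,
                           uint32_t curr) {
      const auto symbol_value = static_cast<uint16_t>(*window_diff >> bits);
      if (symbol_value >= curr) {  // Coded value above the split: a 0 bit.
        *range -= curr;
        *window_diff -= static_cast<uint64_t>(curr) << bits;
        return false;
      }
      *range = curr;  // Coded value below the split: a 1 bit.
      return true;
    }
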
@@ -580,7 +652,8 @@
}
bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
- const bool symbol = ReadSymbolImpl(cdf) != 0;
+ assert(cdf[1] == 0);
+ const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
if (allow_update_cdf_) {
const uint16_t count = cdf[2];
// rate is computed in the spec as:
@@ -608,15 +681,15 @@
return symbol;
}
-bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t* cdf) {
+bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
return ReadSymbolImpl(cdf) != 0;
}
template <int symbol_count>
int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
static_assert(symbol_count >= 3 && symbol_count <= 16, "");
- if (symbol_count == 4) {
- return ReadSymbol4(cdf);
+ if (symbol_count == 3 || symbol_count == 4) {
+ return ReadSymbol3Or4(cdf, symbol_count);
}
int symbol;
if (symbol_count == 8) {
@@ -633,6 +706,8 @@
UpdateCdf7(cdf, symbol);
} else if (symbol_count == 8) {
UpdateCdf8(cdf, symbol);
+ } else if (symbol_count == 9) {
+ UpdateCdf9(cdf, symbol);
} else if (symbol_count == 11) {
UpdateCdf11(cdf, symbol);
} else if (symbol_count == 13) {
@@ -653,8 +728,7 @@
uint32_t curr = values_in_range_;
int symbol = -1;
uint32_t prev;
- const auto symbol_value =
- static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
// Search through the |cdf| array to determine where the scaled cdf value and
// |symbol_value| cross over.
@@ -665,7 +739,7 @@
delta -= kMinimumProbabilityPerSymbol;
} while (symbol_value < curr);
values_in_range_ = prev - curr;
- window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
NormalizeRange();
return symbol;
}
@@ -675,8 +749,7 @@
assert(cdf[symbol_count - 1] == 0);
assert(symbol_count > 1 && symbol_count <= 16);
--symbol_count;
- const auto symbol_value =
- static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
// Search through the |cdf| array to determine where the scaled cdf value and
// |symbol_value| cross over. Since the CDFs are sorted, we can use binary
// search to do this. Let |symbol| be the index of the first |cdf| array
@@ -709,36 +782,36 @@
assert(low == high + 1);
// At this point, |low| is the symbol that has been decoded.
values_in_range_ = prev - curr;
- window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
NormalizeRange();
return low;
}
-int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf) {
- assert(cdf[1] == 0);
- const auto symbol_value =
- static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
- const uint32_t curr = ScaleCdf(values_in_range_ >> 8, cdf, 0, 1);
+int DaalaBitReader::ReadSymbolImpl(uint16_t cdf) {
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ const uint32_t curr =
+ (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
+ kMinimumProbabilityPerSymbol;
const int symbol = static_cast<int>(symbol_value < curr);
if (symbol == 1) {
values_in_range_ = curr;
} else {
values_in_range_ -= curr;
- window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
}
NormalizeRange();
return symbol;
}
-// Equivalent to ReadSymbol(cdf, 4), with the ReadSymbolImpl and UpdateCdf
+// Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
// calls inlined.
-int DaalaBitReader::ReadSymbol4(uint16_t* const cdf) {
- assert(cdf[3] == 0);
+int DaalaBitReader::ReadSymbol3Or4(uint16_t* const cdf,
+ const int symbol_count) {
+ assert(cdf[symbol_count - 1] == 0);
uint32_t curr = values_in_range_;
uint32_t prev;
- const auto symbol_value =
- static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
- uint32_t delta = kMinimumProbabilityPerSymbol * 3;
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
+ uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
const uint32_t values_in_range_shifted = values_in_range_ >> 8;
// Search through the |cdf| array to determine where the scaled cdf value and
@@ -756,11 +829,11 @@
// delta -= kMinimumProbabilityPerSymbol;
// } while (symbol_value < curr);
// if (allow_update_cdf_) {
- // UpdateCdf(cdf, 4, symbol);
+ // UpdateCdf(cdf, [3,4], symbol);
// }
//
- // The do-while loop is unrolled with four iterations, and the UpdateCdf call
- // is inlined and merged into the four iterations.
+ // The do-while loop is unrolled with three or four iterations, and the
+ // UpdateCdf call is inlined and merged into the iterations.
int symbol = 0;
// Iteration 0.
prev = curr;
@@ -769,31 +842,36 @@
if (symbol_value >= curr) {
// symbol == 0.
if (allow_update_cdf_) {
- // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/0).
- const uint16_t count = cdf[4];
- cdf[4] += static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
- // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
- // NEON code is slower. Consider using the C version if __arm__ is
- // defined.
- // 2. The ARM NEON code (compiled for arm64) is slightly slower on
- // Samsung Galaxy S8+ (SM-G955FD).
- uint16x4_t cdf_vec = vld1_u16(cdf);
- const int16x4_t negative_rate = vdup_n_s16(-rate);
- const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
- cdf_vec = vsub_u16(cdf_vec, delta);
- vst1_u16(cdf, cdf_vec);
-#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
- __m128i cdf_vec = LoadLo8(cdf);
- const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
- cdf_vec = _mm_sub_epi16(cdf_vec, delta);
- StoreLo8(cdf, cdf_vec);
-#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
- cdf[0] -= cdf[0] >> rate;
- cdf[1] -= cdf[1] >> rate;
- cdf[2] -= cdf[2] >> rate;
+ // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
+ // NEON code is slower. Consider using the C version if __arm__ is
+ // defined.
+ // 2. The ARM NEON code (compiled for arm64) is slightly slower on
+ // Samsung Galaxy S8+ (SM-G955FD).
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
+ cdf_vec = vsub_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_sub_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ cdf[2] -= cdf[2] >> rate;
#endif
+ } else { // symbol_count == 3.
+ cdf[0] -= cdf[0] >> rate;
+ cdf[1] -= cdf[1] >> rate;
+ }
}
goto found;
}
@@ -806,81 +884,88 @@
if (symbol_value >= curr) {
// symbol == 1.
if (allow_update_cdf_) {
- // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/1).
- const uint16_t count = cdf[4];
- cdf[4] += static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
cdf[1] -= cdf[1] >> rate;
- cdf[2] -= cdf[2] >> rate;
+ if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
}
goto found;
}
++symbol;
- delta -= kMinimumProbabilityPerSymbol;
- // Iteration 2.
+ if (symbol_count == 4) {
+ delta -= kMinimumProbabilityPerSymbol;
+ // Iteration 2.
+ prev = curr;
+ curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
+ delta;
+ if (symbol_value >= curr) {
+ // symbol == 2.
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
+ const uint16_t count = cdf[4];
+ cdf[4] += static_cast<uint16_t>(count < 32);
+ const int rate = (count >> 4) + 5;
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ cdf[2] -= cdf[2] >> rate;
+ }
+ goto found;
+ }
+ ++symbol;
+ }
+ // |delta| is 0 for the last iteration.
+ // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
prev = curr;
- curr =
- ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
- if (symbol_value >= curr) {
- // symbol == 2.
- if (allow_update_cdf_) {
- // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
- const uint16_t count = cdf[4];
- cdf[4] += static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
+ // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
+ curr = 0;
+ // symbol == [2,3].
+ if (allow_update_cdf_) {
+ // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
+ const uint16_t count = cdf[symbol_count];
+ cdf[symbol_count] += static_cast<uint16_t>(count < 32);
+    const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
+ if (symbol_count == 4) {
+#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
+ // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
+ // code is a tiny bit slower. Consider using the C version if __arm__ is
+ // defined.
+ uint16x4_t cdf_vec = vld1_u16(cdf);
+ const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
+ const int16x4_t diff =
+ vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
+ const int16x4_t negative_rate = vdup_n_s16(-rate);
+ const uint16x4_t delta =
+ vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
+ cdf_vec = vadd_u16(cdf_vec, delta);
+ vst1_u16(cdf, cdf_vec);
+ cdf[3] = 0;
+#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
+ __m128i cdf_vec = LoadLo8(cdf);
+ const __m128i cdf_max_probability =
+ _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
+ const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
+ const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
+ cdf_vec = _mm_add_epi16(cdf_vec, delta);
+ StoreLo8(cdf, cdf_vec);
+ cdf[3] = 0;
+#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
- cdf[2] -= cdf[2] >> rate;
- }
- goto found;
- }
- ++symbol;
- // |delta| is 0 for the last iteration.
- // Iteration 3.
- prev = curr;
- // Since cdf[3] is 0 and |delta| is 0, |curr| is also 0.
- curr = 0;
- // symbol == 3.
- if (allow_update_cdf_) {
- // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/3).
- const uint16_t count = cdf[4];
- cdf[4] += static_cast<uint16_t>(count < 32);
- const int rate = (4 | (count >> 4)) + 1;
-#if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
- // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
- // code is a tiny bit slower. Consider using the C version if __arm__ is
- // defined.
- uint16x4_t cdf_vec = vld1_u16(cdf);
- const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
- const int16x4_t diff =
- vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
- const int16x4_t negative_rate = vdup_n_s16(-rate);
- const uint16x4_t delta =
- vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
- cdf_vec = vadd_u16(cdf_vec, delta);
- vst1_u16(cdf, cdf_vec);
- cdf[3] = 0;
-#elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
- __m128i cdf_vec = LoadLo8(cdf);
- const __m128i cdf_max_probability =
- _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
- const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
- const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
- cdf_vec = _mm_add_epi16(cdf_vec, delta);
- StoreLo8(cdf, cdf_vec);
- cdf[3] = 0;
-#else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
- cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
- cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
- cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
+ cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
#endif
+ } else { // symbol_count == 3.
+ cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
+ cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
+ }
}
found:
// End of unrolled do-while loop.
values_in_range_ = prev - curr;
- window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
NormalizeRange();
return symbol;
}
@@ -889,8 +974,7 @@
assert(cdf[7] == 0);
uint32_t curr = values_in_range_;
uint32_t prev;
- const auto symbol_value =
- static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
+ const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
uint32_t delta = kMinimumProbabilityPerSymbol * 7;
// Search through the |cdf| array to determine where the scaled cdf value and
// |symbol_value| cross over.
@@ -944,12 +1028,13 @@
// End of unrolled do-while loop.
values_in_range_ = prev - curr;
- window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
+ window_diff_ -= static_cast<WindowSize>(curr) << bits_;
NormalizeRange();
return symbol;
}
void DaalaBitReader::PopulateBits() {
+ constexpr int kMaxCachedBits = kWindowSize - 16;
#if defined(__aarch64__)
// Fast path: read eight bytes and add the first six bytes to window_diff_.
// This fast path makes the following assumptions.
@@ -962,27 +1047,25 @@
// performance (measured on Lenovo ThinkStation P920 running Linux). (The
// reason is still unknown.) Therefore this fast path is only used on arm64.
static_assert(kWindowSize == 64, "");
- if (size_ - data_index_ >= 8) {
+ if (data_ < data_memcpy_end_) {
uint64_t value;
// arm64 supports unaligned loads, so this memcpy call is compiled to a
// single ldr instruction.
- memcpy(&value, &data_[data_index_], 8);
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
- value = __builtin_bswap64(value);
-#endif
- value &= 0xffffffffffff0000;
- window_diff_ ^= static_cast<WindowSize>(value) >> (bits_ + 16);
- data_index_ += 6;
- bits_ += 6 * 8;
+ memcpy(&value, data_, sizeof(value));
+ data_ += kMaxCachedBits >> 3;
+ value = HostToBigEndian(value) ^ -1;
+ value >>= kWindowSize - kMaxCachedBits;
+ window_diff_ = value | (window_diff_ << kMaxCachedBits);
+ bits_ += kMaxCachedBits;
return;
}
#endif
- size_t data_index = data_index_;
+ const uint8_t* data = data_;
int bits = bits_;
WindowSize window_diff = window_diff_;
- int shift = kWindowSize - 9 - (bits + 15);
+ int count = kWindowSize - 9 - (bits + 15);
// The fast path above, if compiled, would cause clang 8.0.7 to vectorize
// this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
// iterations when WindowSize is 64 bits. So it is not profitable to
@@ -992,23 +1075,26 @@
#ifdef __clang__
#pragma clang loop vectorize(disable) interleave(disable)
#endif
- for (; shift >= 0 && data_index < size_; shift -= 8) {
- window_diff ^= static_cast<WindowSize>(data_[data_index++]) << shift;
+ for (; count >= 0 && data < data_end_; count -= 8) {
+ const uint8_t value = *data++ ^ -1;
+ window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
bits += 8;
}
- if (data_index >= size_) {
- bits = kLargeBitCount;
+ assert(bits <= kMaxCachedBits);
+ if (data == data_end_) {
+ // Shift in some 1s. This is equivalent to providing fake 0 data bits.
+ window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
+ bits = kMaxCachedBits;
}
- data_index_ = data_index;
+ data_ = data;
bits_ = bits;
window_diff_ = window_diff;
}
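
The end-of-data padding above relies on a small identity: since the low bits of the shifted window are zero, ((w + 1) << n) - 1 appends exactly n one-bits (the inverted form of fake 0 data bits). A quick standalone check:

    #include <cassert>
    #include <cstdint>

    int main() {
      // ((w + 1) << n) - 1 == (w << n) | ((1 << n) - 1): shift, then OR in
      // n one-bits.
      for (uint64_t w = 0; w < 1024; ++w) {
        for (int n = 0; n < 16; ++n) {
          assert((((w + 1) << n) - 1) ==
                 ((w << n) | ((uint64_t{1} << n) - 1)));
        }
      }
      return 0;
    }
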
void DaalaBitReader::NormalizeRange() {
- const int bits_used = 15 - FloorLog2(values_in_range_);
+ const int bits_used = 15 ^ FloorLog2(values_in_range_);
bits_ -= bits_used;
- window_diff_ = ((window_diff_ + 1) << bits_used) - 1;
values_in_range_ <<= bits_used;
if (bits_ < 0) PopulateBits();
}
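
The switch from 15 - FloorLog2(...) to 15 ^ FloorLog2(...) is likewise an identity: |values_in_range_| is a nonzero 16-bit quantity, so FloorLog2 of it lies in [0, 15], and XOR against 0b1111 cannot borrow. A one-loop check:

    #include <cassert>

    int main() {
      for (int v = 0; v <= 15; ++v) assert((15 ^ v) == (15 - v));
      return 0;
    }
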
@@ -1017,10 +1103,13 @@
template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
diff --git a/libgav1/src/utils/entropy_decoder.h b/libgav1/src/utils/entropy_decoder.h
index 75c633b..c066b98 100644
--- a/libgav1/src/utils/entropy_decoder.h
+++ b/libgav1/src/utils/entropy_decoder.h
@@ -27,6 +27,10 @@
class DaalaBitReader : public BitReader {
public:
+ // WindowSize must be an unsigned integer type with at least 32 bits. Use the
+ // largest type with fast arithmetic. size_t should meet these requirements.
+ using WindowSize = size_t;
+
DaalaBitReader(const uint8_t* data, size_t size, bool allow_update_cdf);
~DaalaBitReader() override = default;
@@ -42,7 +46,7 @@
// ReadSymbol() calls for which the |symbol_count| is equal to 2 (boolean
// symbols) will use this variant.
bool ReadSymbol(uint16_t* cdf);
- bool ReadSymbolWithoutCdfUpdate(uint16_t* cdf);
+ bool ReadSymbolWithoutCdfUpdate(uint16_t cdf);
// Use either linear search or binary search for decoding the symbol depending
// on |symbol_count|. ReadSymbol calls for which the |symbol_count| is known
// at compile time will use this variant.
@@ -50,10 +54,6 @@
int ReadSymbol(uint16_t* cdf);
private:
- // WindowSize must be an unsigned integer type with at least 32 bits. Use the
- // largest type with fast arithmetic. size_t should meet these requirements.
- static_assert(sizeof(size_t) == sizeof(void*), "");
- using WindowSize = size_t;
static constexpr int kWindowSize = static_cast<int>(sizeof(WindowSize)) * 8;
static_assert(kWindowSize >= 32, "");
@@ -72,9 +72,9 @@
inline int ReadSymbolImplBinarySearch(const uint16_t* cdf, int symbol_count);
// Specialized implementation of ReadSymbolImpl based on the fact that
// symbol_count == 2.
- inline int ReadSymbolImpl(const uint16_t* cdf);
+ inline int ReadSymbolImpl(uint16_t cdf);
// ReadSymbolN is a specialization of ReadSymbol for symbol_count == N.
- LIBGAV1_ALWAYS_INLINE int ReadSymbol4(uint16_t* cdf);
+ LIBGAV1_ALWAYS_INLINE int ReadSymbol3Or4(uint16_t* cdf, int symbol_count);
// ReadSymbolImplN is a specialization of ReadSymbolImpl for
// symbol_count == N.
LIBGAV1_ALWAYS_INLINE int ReadSymbolImpl8(const uint16_t* cdf);
@@ -83,28 +83,37 @@
// calls PopulateBits() if necessary.
inline void NormalizeRange();
- const uint8_t* const data_;
- const size_t size_;
- size_t data_index_;
+ const uint8_t* data_;
+ const uint8_t* const data_end_;
+ // If |data_| < |data_memcpy_end_|, then we can read sizeof(WindowSize) bytes
+ // from |data_|. Note with sizeof(WindowSize) == 4 this is only used in the
+ // constructor, not PopulateBits().
+ const uint8_t* const data_memcpy_end_;
const bool allow_update_cdf_;
- // Number of bits of data in the current value.
+ // Number of cached bits of data in the current value.
int bits_;
// Number of values in the current range. Declared as uint32_t for better
// performance but only the lower 16 bits are used.
uint32_t values_in_range_;
// The difference between the high end of the current range and the coded
- // value minus 1. The 16 most significant bits of this variable is used to
+ // value minus 1. The 16 bits above |bits_| of this variable are used to
// decode the next symbol. It is filled in whenever |bits_| is less than 0.
+  // Note this implementation differs from the spec: it trades the need to
+  // shift in 1s in NormalizeRange() for an extra shift in PopulateBits(),
+  // which occurs less frequently.
WindowSize window_diff_;
};
extern template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<6>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<9>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
+extern template int DaalaBitReader::ReadSymbol<12>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
extern template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
diff --git a/libgav1/src/utils/libgav1_utils.cmake b/libgav1/src/utils/libgav1_utils.cmake
index 8b6ec4b..587ca5d 100644
--- a/libgav1/src/utils/libgav1_utils.cmake
+++ b/libgav1/src/utils/libgav1_utils.cmake
@@ -39,8 +39,6 @@
"${libgav1_source}/utils/logging.cc"
"${libgav1_source}/utils/logging.h"
"${libgav1_source}/utils/memory.h"
- "${libgav1_source}/utils/parameter_tree.cc"
- "${libgav1_source}/utils/parameter_tree.h"
"${libgav1_source}/utils/queue.h"
"${libgav1_source}/utils/raw_bit_reader.cc"
"${libgav1_source}/utils/raw_bit_reader.h"
diff --git a/libgav1/src/utils/logging.cc b/libgav1/src/utils/logging.cc
index 9a43c22..26e3e15 100644
--- a/libgav1/src/utils/logging.cc
+++ b/libgav1/src/utils/logging.cc
@@ -56,7 +56,7 @@
va_end(ap);
fprintf(stderr, "\n");
}
-#else // !LIBGAV1_ENABLE_LOGGING
+#else // !LIBGAV1_ENABLE_LOGGING
void Log(LogSeverity /*severity*/, const char* /*file*/, int /*line*/,
const char* /*format*/, ...) {}
#endif // LIBGAV1_ENABLE_LOGGING
diff --git a/libgav1/src/utils/logging.h b/libgav1/src/utils/logging.h
index 48928db..473aebd 100644
--- a/libgav1/src/utils/logging.h
+++ b/libgav1/src/utils/logging.h
@@ -35,13 +35,13 @@
// setting LIBGAV1_ENABLE_LOGGING.
// Severity is given as an all-caps version of enum LogSeverity with the
// leading 'k' removed: LIBGAV1_DLOG(INFO, "...");
-#define LIBGAV1_DLOG(severity, ...) \
- do { \
- constexpr const char* libgav1_logging_internal_basename = \
- ::libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
- ::libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
- libgav1_logging_internal_basename, __LINE__, \
- __VA_ARGS__); \
+#define LIBGAV1_DLOG(severity, ...) \
+ do { \
+ constexpr const char* libgav1_logging_internal_basename = \
+ libgav1::internal::Basename(__FILE__, sizeof(__FILE__) - 1); \
+ libgav1::internal::Log(LIBGAV1_LOGGING_INTERNAL_##severity, \
+ libgav1_logging_internal_basename, __LINE__, \
+ __VA_ARGS__); \
} while (0)
#else
#define LIBGAV1_DLOG(severity, ...) \
@@ -49,10 +49,10 @@
} while (0)
#endif // LIBGAV1_ENABLE_LOGGING
-#define LIBGAV1_LOGGING_INTERNAL_ERROR ::libgav1::internal::LogSeverity::kError
+#define LIBGAV1_LOGGING_INTERNAL_ERROR libgav1::internal::LogSeverity::kError
#define LIBGAV1_LOGGING_INTERNAL_WARNING \
- ::libgav1::internal::LogSeverity::kWarning
-#define LIBGAV1_LOGGING_INTERNAL_INFO ::libgav1::internal::LogSeverity::kInfo
+ libgav1::internal::LogSeverity::kWarning
+#define LIBGAV1_LOGGING_INTERNAL_INFO libgav1::internal::LogSeverity::kInfo
namespace libgav1 {
namespace internal {
diff --git a/libgav1/src/utils/memory.h b/libgav1/src/utils/memory.h
index 80c1d8c..a8da53b 100644
--- a/libgav1/src/utils/memory.h
+++ b/libgav1/src/utils/memory.h
@@ -34,8 +34,9 @@
enum {
// The byte alignment required for buffers used with SIMD code to be read or
// written with aligned operations.
-#if defined(__i386__) || defined(_M_IX86)
- kMaxAlignment = 16, // extended alignment is safe on x86.
+#if defined(__i386__) || defined(_M_IX86) || defined(__x86_64__) || \
+ defined(_M_X64)
+ kMaxAlignment = 32, // extended alignment is safe on x86.
#else
kMaxAlignment = alignof(max_align_t),
#endif
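
The new 32-byte bound matches the alignment required by the aligned AVX2 loads and stores added elsewhere in this change. An illustrative (hypothetical) use, compiled with AVX2 enabled:

    #include <immintrin.h>
    #include <cstdint>

    // alignas(32) mirrors what kMaxAlignment now guarantees for the
    // library's own allocations; _mm256_load_si256 requires it.
    alignas(32) uint16_t buffer[16] = {};

    __m256i LoadBuffer() {
      return _mm256_load_si256(reinterpret_cast<const __m256i*>(buffer));
    }
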
@@ -70,7 +71,7 @@
// more convenient to use memalign(). Unlike glibc, Android does not consider
// memalign() an obsolete function.
return memalign(alignment, size);
-#else // !defined(__ANDROID__)
+#else // !defined(__ANDROID__)
void* ptr = nullptr;
// posix_memalign requires that the requested alignment be at least
// sizeof(void*). In this case, fall back on malloc which should return
diff --git a/libgav1/src/utils/parameter_tree.cc b/libgav1/src/utils/parameter_tree.cc
deleted file mode 100644
index 9426ce6..0000000
--- a/libgav1/src/utils/parameter_tree.cc
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2019 The libgav1 Authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "src/utils/parameter_tree.h"
-
-#include <cassert>
-#include <memory>
-#include <new>
-
-#include "src/utils/common.h"
-#include "src/utils/constants.h"
-#include "src/utils/logging.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-// static
-std::unique_ptr<ParameterTree> ParameterTree::Create(int row4x4, int column4x4,
- BlockSize block_size,
- bool is_leaf) {
- std::unique_ptr<ParameterTree> tree(
- new (std::nothrow) ParameterTree(row4x4, column4x4, block_size));
- if (tree != nullptr && is_leaf && !tree->SetPartitionType(kPartitionNone)) {
- tree = nullptr;
- }
- return tree;
-}
-
-bool ParameterTree::SetPartitionType(Partition partition) {
- assert(!partition_type_set_);
- partition_ = partition;
- partition_type_set_ = true;
- const int block_width4x4 = kNum4x4BlocksWide[block_size_];
- const int half_block4x4 = block_width4x4 >> 1;
- const int quarter_block4x4 = half_block4x4 >> 1;
- const BlockSize sub_size = kSubSize[partition][block_size_];
- const BlockSize split_size = kSubSize[kPartitionSplit][block_size_];
- assert(partition == kPartitionNone || sub_size != kBlockInvalid);
- switch (partition) {
- case kPartitionNone:
- parameters_.reset(new (std::nothrow) BlockParameters());
- return parameters_ != nullptr;
- case kPartitionHorizontal:
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr;
- case kPartitionVertical:
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr;
- case kPartitionSplit:
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, sub_size, false);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, false);
- children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, false);
- children_[3] = ParameterTree::Create(
- row4x4_ + half_block4x4, column4x4_ + half_block4x4, sub_size, false);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr && children_[3] != nullptr;
- case kPartitionHorizontalWithTopSplit:
- assert(split_size != kBlockInvalid);
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, split_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- split_size, true);
- children_[2] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionHorizontalWithBottomSplit:
- assert(split_size != kBlockInvalid);
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- split_size, true);
- children_[2] =
- ParameterTree::Create(row4x4_ + half_block4x4,
- column4x4_ + half_block4x4, split_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionVerticalWithLeftSplit:
- assert(split_size != kBlockInvalid);
- children_[0] =
- ParameterTree::Create(row4x4_, column4x4_, split_size, true);
- children_[1] = ParameterTree::Create(row4x4_ + half_block4x4, column4x4_,
- split_size, true);
- children_[2] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- sub_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionVerticalWithRightSplit:
- assert(split_size != kBlockInvalid);
- children_[0] = ParameterTree::Create(row4x4_, column4x4_, sub_size, true);
- children_[1] = ParameterTree::Create(row4x4_, column4x4_ + half_block4x4,
- split_size, true);
- children_[2] =
- ParameterTree::Create(row4x4_ + half_block4x4,
- column4x4_ + half_block4x4, split_size, true);
- return children_[0] != nullptr && children_[1] != nullptr &&
- children_[2] != nullptr;
- case kPartitionHorizontal4:
- for (int i = 0; i < 4; ++i) {
- children_[i] = ParameterTree::Create(row4x4_ + i * quarter_block4x4,
- column4x4_, sub_size, true);
- if (children_[i] == nullptr) return false;
- }
- return true;
- default:
- assert(partition == kPartitionVertical4);
- for (int i = 0; i < 4; ++i) {
- children_[i] = ParameterTree::Create(
- row4x4_, column4x4_ + i * quarter_block4x4, sub_size, true);
- if (children_[i] == nullptr) return false;
- }
- return true;
- }
-}
-
-} // namespace libgav1
diff --git a/libgav1/src/utils/parameter_tree.h b/libgav1/src/utils/parameter_tree.h
deleted file mode 100644
index 935f3eb..0000000
--- a/libgav1/src/utils/parameter_tree.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright 2019 The libgav1 Authors
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-#define LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
-
-#include <cassert>
-#include <memory>
-
-#include "src/utils/common.h"
-#include "src/utils/compiler_attributes.h"
-#include "src/utils/constants.h"
-#include "src/utils/memory.h"
-#include "src/utils/types.h"
-
-namespace libgav1 {
-
-class ParameterTree : public Allocable {
- public:
- // Creates a parameter tree to store the parameters of a block of size
- // |block_size| starting at coordinates |row4x4| and |column4x4|. If |is_leaf|
- // is set to true, the memory will be allocated for the BlockParameters for
- // this node. Otherwise, no memory will be allocated. If |is_leaf| is set to
- // false, |block_size| must be a square block, i.e.,
- // kBlockWidthPixels[block_size] must be equal to
- // kBlockHeightPixels[block_size].
- static std::unique_ptr<ParameterTree> Create(int row4x4, int column4x4,
- BlockSize block_size,
- bool is_leaf = false);
-
- // Move only (not Copyable).
- ParameterTree(ParameterTree&& other) = default;
- ParameterTree& operator=(ParameterTree&& other) = default;
- ParameterTree(const ParameterTree&) = delete;
- ParameterTree& operator=(const ParameterTree&) = delete;
-
- // Set the partition type of the current node to |partition|.
- // if (partition == kPartitionNone) {
- // Memory will be allocated for the BlockParameters for this node.
- // } else if (partition != kPartitionSplit) {
- // The appropriate child nodes will be populated and memory will be
- // allocated for the BlockParameters of the children.
- // } else {
- // The appropriate child nodes will be populated but they are considered to
- // be hanging, i.e., future calls to SetPartitionType() on the child nodes
- // will have to set them or their descendants to a terminal type.
- // }
- // This function must be called only once per node.
- LIBGAV1_MUST_USE_RESULT bool SetPartitionType(Partition partition);
-
- // Basic getters.
- int row4x4() const { return row4x4_; }
- int column4x4() const { return column4x4_; }
- BlockSize block_size() const { return block_size_; }
- Partition partition() const { return partition_; }
- ParameterTree* children(int index) const {
- assert(index < 4);
- return children_[index].get();
- }
- // Returns the BlockParameters object of the current node if one exists.
- // Otherwise returns nullptr. This function will return a valid
- // BlockParameters object only for leaf nodes.
- BlockParameters* parameters() const { return parameters_.get(); }
-
- private:
- ParameterTree(int row4x4, int column4x4, BlockSize block_size)
- : row4x4_(row4x4), column4x4_(column4x4), block_size_(block_size) {}
-
- Partition partition_ = kPartitionNone;
- std::unique_ptr<BlockParameters> parameters_ = nullptr;
- int row4x4_ = -1;
- int column4x4_ = -1;
- BlockSize block_size_ = kBlockInvalid;
- bool partition_type_set_ = false;
-
- // Child values are defined as follows for various partition types:
- // * Horizontal: 0 top partition; 1 bottom partition; 2 nullptr; 3 nullptr;
- // * Vertical: 0 left partition; 1 right partition; 2 nullptr; 3 nullptr;
- // * Split: 0 top-left partition; 1 top-right partition; 2; bottom-left
- // partition; 3 bottom-right partition;
- // * HorizontalWithTopSplit: 0 top-left partition; 1 top-right partition; 2
- // bottom partition; 3 nullptr;
- // * HorizontalWithBottomSplit: 0 top partition; 1 bottom-left partition; 2
- // bottom-right partition; 3 nullptr;
- // * VerticalWithLeftSplit: 0 top-left partition; 1 bottom-left partition; 2
- // right partition; 3 nullptr;
- // * VerticalWithRightSplit: 0 left-partition; 1 top-right partition; 2
- // bottom-right partition; 3 nullptr;
- // * Horizontal4: 0 top partition; 1 second top partition; 2 third top
- // partition; 3 bottom partition;
- // * Vertical4: 0 left partition; 1 second left partition; 2 third left
- // partition; 3 right partition;
- std::unique_ptr<ParameterTree> children_[4] = {};
-
- friend class ParameterTreeTest;
-};
-
-} // namespace libgav1
-
-#endif // LIBGAV1_SRC_UTILS_PARAMETER_TREE_H_
diff --git a/libgav1/src/utils/raw_bit_reader.h b/libgav1/src/utils/raw_bit_reader.h
index 76e7bfa..7d8ce8f 100644
--- a/libgav1/src/utils/raw_bit_reader.h
+++ b/libgav1/src/utils/raw_bit_reader.h
@@ -38,7 +38,7 @@
size_t* value); // le(n) in the spec.
bool ReadUnsignedLeb128(size_t* value); // leb128() in the spec.
// Reads a variable length unsigned number and stores it in |*value|. On a
- // successful return, |*value| is in the range of 0 to UINT32_MAX − 1,
+ // successful return, |*value| is in the range of 0 to UINT32_MAX - 1,
// inclusive.
bool ReadUvlc(uint32_t* value); // uvlc() in the spec.
bool Finished() const;
diff --git a/libgav1/src/utils/threadpool.cc b/libgav1/src/utils/threadpool.cc
index 8c8f4fe..a3099e1 100644
--- a/libgav1/src/utils/threadpool.cc
+++ b/libgav1/src/utils/threadpool.cc
@@ -37,17 +37,21 @@
#include <chrono> // NOLINT (unapproved c++11 header)
#endif
+// Define the GetTid() function, a wrapper for the gettid() system call in
+// Linux.
+#if defined(__ANDROID__)
+static pid_t GetTid() { return gettid(); }
+#elif defined(__GLIBC__)
// The glibc wrapper for the gettid() system call was added in glibc 2.30.
// Emulate it for older versions of glibc.
-#if defined(__GLIBC_PREREQ)
-#if !__GLIBC_PREREQ(2, 30)
-
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 30)
+static pid_t GetTid() { return gettid(); }
+#else // Older than glibc 2.30
#include <sys/syscall.h>
-static pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
-
-#endif
-#endif // defined(__GLIBC_PREREQ)
+static pid_t GetTid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
+#endif // glibc 2.30 or later.
+#endif // defined(__GLIBC__)
namespace libgav1 {
@@ -216,7 +220,7 @@
// If the |name| buffer is longer than 16 bytes, pthread_setname_np fails
// with error 34 (ERANGE) on Android.
char name[16];
- pid_t id = gettid();
+ pid_t id = GetTid();
int rv = snprintf(name, sizeof(name), "%s/%" PRId64, pool_->name_prefix_,
static_cast<int64_t>(id));
assert(rv >= 0);
diff --git a/libgav1/src/utils/types.h b/libgav1/src/utils/types.h
index c0ac76c..eba13b7 100644
--- a/libgav1/src/utils/types.h
+++ b/libgav1/src/utils/types.h
@@ -18,6 +18,7 @@
#define LIBGAV1_SRC_UTILS_TYPES_H_
#include <array>
+#include <cstddef>
#include <cstdint>
#include <memory>
@@ -320,7 +321,7 @@
struct LoopRestoration {
LoopRestorationType type[kMaxPlanes];
- int unit_size[kMaxPlanes];
+ int unit_size_log2[kMaxPlanes];
};
// Stores the quantization parameters of Section 5.9.12.
@@ -512,6 +513,10 @@
Delta delta_lf;
// A valid value of reference_frame_index[i] is in the range [0, 7]. -1
// indicates an invalid value.
+ //
+ // NOTE: When the frame is an intra frame (frame_type is kFrameKey or
+ // kFrameIntraOnly), reference_frame_index is not used and may be
+ // uninitialized.
int8_t reference_frame_index[kNumInterReferenceFrameTypes];
// The ref_order_hint[ i ] syntax element in the uncompressed header.
// Specifies the expected output order hint for each reference frame.
@@ -521,5 +526,24 @@
FilmGrainParams film_grain_params;
};
+// Structure used for traversing the partition tree.
+struct PartitionTreeNode {
+ PartitionTreeNode() = default;
+ PartitionTreeNode(int row4x4, int column4x4, BlockSize block_size)
+ : row4x4(row4x4), column4x4(column4x4), block_size(block_size) {}
+ int row4x4 = -1;
+ int column4x4 = -1;
+ BlockSize block_size = kBlockInvalid;
+};
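
PartitionTreeNode supports an iterative, stack-based walk in place of the recursive ParameterTree deleted above. A toy, self-contained sketch of that traversal pattern (all names and the fixed split-to-16x16 rule are hypothetical, standing in for partition types decoded from the bitstream):

    #include <cstdio>
    #include <vector>

    // Toy stand-in: |size| is in pixels instead of a BlockSize enum.
    struct Node {
      int row, col, size;
    };

    int main() {
      std::vector<Node> stack{{0, 0, 64}};
      while (!stack.empty()) {
        const Node n = stack.back();
        stack.pop_back();
        if (n.size > 16) {  // Pretend kPartitionSplit was decoded.
          const int half = n.size / 2;
          stack.push_back({n.row, n.col, half});
          stack.push_back({n.row, n.col + half, half});
          stack.push_back({n.row + half, n.col, half});
          stack.push_back({n.row + half, n.col + half, half});
        } else {
          std::printf("leaf %dx%d at (%d,%d)\n", n.size, n.size, n.row, n.col);
        }
      }
      return 0;
    }
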
+
+// Structure used for storing the transform parameters in a superblock.
+struct TransformParameters {
+ TransformParameters() = default;
+ TransformParameters(TransformType type, int non_zero_coeff_count)
+ : type(type), non_zero_coeff_count(non_zero_coeff_count) {}
+ TransformType type;
+ int non_zero_coeff_count;
+};
+
} // namespace libgav1
#endif // LIBGAV1_SRC_UTILS_TYPES_H_
diff --git a/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc b/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
index 6e8b6a0..d1b1c54 100644
--- a/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
+++ b/libgav1/tests/fuzzer/decoder_fuzzer_frame_parallel.cc
@@ -121,14 +121,12 @@
const libgav1::DecoderBuffer* buffer;
libgav1::StatusCode status = decoder.DequeueFrame(&buffer);
- if (status != libgav1::kStatusOk &&
- status != libgav1::kStatusNothingToDequeue) {
- break;
- }
- if (buffer == nullptr) {
- dequeue_finished = status == libgav1::kStatusNothingToDequeue;
- } else {
+ if (status == libgav1::kStatusNothingToDequeue) {
+ dequeue_finished = true;
+ } else if (status == libgav1::kStatusOk) {
dequeue_finished = false;
+ } else {
+ break;
}
} while (input_buffer != nullptr || !file_reader->IsEndOfFile() ||
!dequeue_finished);