[automerger skipped] DO NOT MERGE - qt-qpr1-dev-plus-aosp-without-vendor@5915889 into stage-aosp-master am: 4f4e977699
am: 349a55e8d8 -s ours
am skip reason: subject contains skip directive
Change-Id: Iaa4e32b1be60d54cc32f1a5ba2cba5f881cb58fd
diff --git a/README.google b/README.google
deleted file mode 100644
index 285364e..0000000
--- a/README.google
+++ /dev/null
@@ -1,13 +0,0 @@
-URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: r1602
-License: BSD
-License File: LICENSE
-
-Description:
-libyuv is an open-source library for yuv scaling, conversion, comparison
-and rendering.
-Specifically libyuv is optimized for SSE2/SSSE3 and Neon and has demonstrated
-speed up to 10x to 16x compared to C code.
-
-Local Modifications:
-None
diff --git a/README.version b/README.version
index 6ce5d06..0e74ad1 100644
--- a/README.version
+++ b/README.version
@@ -1,3 +1,3 @@
-Version: r1652
+Version: r1732
BugComponent: 42195
Owner: lajos
diff --git a/files/Android.bp b/files/Android.bp
index 5898b68..ea76f94 100644
--- a/files/Android.bp
+++ b/files/Android.bp
@@ -11,6 +11,8 @@
"source/compare_gcc.cc",
"source/compare_neon.cc",
"source/compare_neon64.cc",
+ "source/compare_mmi.cc",
+ "source/compare_msa.cc",
"source/convert.cc",
"source/convert_argb.cc",
"source/convert_from.cc",
@@ -23,15 +25,15 @@
"source/rotate_any.cc",
"source/rotate_argb.cc",
"source/rotate_common.cc",
- "source/rotate_dspr2.cc",
"source/rotate_gcc.cc",
+ "source/rotate_mmi.cc",
"source/rotate_msa.cc",
"source/rotate_neon.cc",
"source/rotate_neon64.cc",
"source/row_any.cc",
"source/row_common.cc",
- "source/row_dspr2.cc",
"source/row_gcc.cc",
+ "source/row_mmi.cc",
"source/row_msa.cc",
"source/row_neon.cc",
"source/row_neon64.cc",
@@ -39,13 +41,12 @@
"source/scale_any.cc",
"source/scale_argb.cc",
"source/scale_common.cc",
- "source/scale_dspr2.cc",
"source/scale_gcc.cc",
+ "source/scale_mmi.cc",
"source/scale_msa.cc",
"source/scale_neon.cc",
"source/scale_neon64.cc",
"source/video_common.cc",
-
"source/convert_jpeg.cc",
"source/mjpeg_decoder.cc",
"source/mjpeg_validate.cc",
@@ -77,7 +78,6 @@
static_libs: ["libyuv"],
shared_libs: ["libjpeg"],
cflags: ["-Wall", "-Werror"],
-
srcs: [
"unit_test/unit_test.cc",
"unit_test/basictypes_test.cc",
@@ -85,6 +85,7 @@
"unit_test/compare_test.cc",
"unit_test/convert_test.cc",
"unit_test/cpu_test.cc",
+ "unit_test/cpu_thread_test.cc",
"unit_test/math_test.cc",
"unit_test/planar_test.cc",
"unit_test/rotate_argb_test.cc",
@@ -94,3 +95,42 @@
"unit_test/video_common_test.cc",
],
}
+
+cc_test {
+ name: "compare",
+ gtest: false,
+ srcs: [
+ "util/compare.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "cpuid",
+ gtest: false,
+ srcs: [
+ "util/cpuid.c",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "psnr",
+ gtest: false,
+ srcs: [
+ "util/psnr_main.cc",
+ "util/psnr.cc",
+ "util/ssim.cc",
+ ],
+ static_libs: ["libyuv"],
+}
+
+cc_test {
+ name: "yuvconvert",
+ gtest: false,
+ srcs: [
+ "util/yuvconvert.cc",
+ ],
+ static_libs: ["libyuv"],
+ shared_libs: ["libjpeg"],
+}
diff --git a/files/BUILD.gn b/files/BUILD.gn
index a50aab5..8904fd6 100644
--- a/files/BUILD.gn
+++ b/files/BUILD.gn
@@ -12,6 +12,11 @@
declare_args() {
# Set to false to disable building with gflags.
libyuv_use_gflags = true
+
+ # When building a shared library using a target in WebRTC or
+ # Chromium projects that depends on libyuv, setting this flag
+ # to true makes libyuv symbols visible inside that library.
+ libyuv_symbols_visible = false
}
config("libyuv_config") {
@@ -33,29 +38,52 @@
if (libyuv_include_tests) {
deps += [
":compare",
- ":yuvconvert",
":cpuid",
":libyuv_unittest",
":psnr",
+ ":yuvconvert",
]
}
}
group("libyuv") {
- public_configs = [ ":libyuv_config" ]
+ all_dependent_configs = [ ":libyuv_config" ]
+ deps = []
if (is_win && target_cpu == "x64") {
+ # Compile with clang in order to get inline assembly
public_deps = [
- ":libyuv_internal(//build/toolchain/win:clang_x64)",
+ ":libyuv_internal(//build/toolchain/win:win_clang_x64)",
]
} else {
public_deps = [
":libyuv_internal",
]
}
+
+ if (libyuv_use_neon) {
+ deps += [ ":libyuv_neon" ]
+ }
+
+ if (libyuv_use_msa) {
+ deps += [ ":libyuv_msa" ]
+ }
+
+ if (libyuv_use_mmi) {
+ deps += [ ":libyuv_mmi" ]
+ }
+
+ if (!is_ios) {
+ # Make sure that clients of libyuv link with libjpeg. This can't go in
+ # libyuv_internal because in Windows x64 builds that will generate a clang
+ # build of libjpeg, and we don't want two copies.
+ deps += [ "//third_party:jpeg" ]
+ }
}
static_library("libyuv_internal") {
+ visibility = [ ":*" ]
+
sources = [
# Headers
"include/libyuv.h",
@@ -98,19 +126,16 @@
"source/rotate_any.cc",
"source/rotate_argb.cc",
"source/rotate_common.cc",
- "source/rotate_dspr2.cc",
"source/rotate_gcc.cc",
"source/rotate_win.cc",
"source/row_any.cc",
"source/row_common.cc",
- "source/row_dspr2.cc",
"source/row_gcc.cc",
"source/row_win.cc",
"source/scale.cc",
"source/scale_any.cc",
"source/scale_argb.cc",
"source/scale_common.cc",
- "source/scale_dspr2.cc",
"source/scale_gcc.cc",
"source/scale_win.cc",
"source/video_common.cc",
@@ -120,17 +145,17 @@
defines = []
deps = []
+ if (libyuv_symbols_visible) {
+ configs -= [ "//build/config/gcc:symbol_visibility_hidden" ]
+ configs += [ "//build/config/gcc:symbol_visibility_default" ]
+ }
+
if (!is_ios) {
defines += [ "HAVE_JPEG" ]
- deps += [ "//third_party:jpeg" ]
- }
- if (libyuv_use_neon) {
- deps += [ ":libyuv_neon" ]
- }
-
- if (libyuv_use_msa) {
- deps += [ ":libyuv_msa" ]
+ # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps
+ # because in Windows x64 build it will get compiled with clang.
+ deps += [ "//third_party:jpeg_includes" ]
}
# Always enable optimization for Release and NaCl builds (to workaround
@@ -143,7 +168,14 @@
}
# To enable AVX2 or other cpu optimization, pass flag here
- # cflags = [ "-mavx2" ]
+ if (!is_win) {
+ cflags = [
+ # "-mpopcnt",
+ # "-mavx2",
+ # "-mfma",
+ "-ffp-contract=fast", # Enable fma vectorization for NEON.
+ ]
+ }
}
if (libyuv_use_neon) {
@@ -160,6 +192,10 @@
"source/scale_neon64.cc",
]
+ deps = [
+ ":libyuv_internal",
+ ]
+
public_configs = [ ":libyuv_config" ]
# Always enable optimization for Release and NaCl builds (to workaround
@@ -168,6 +204,7 @@
configs -= [ "//build/config/compiler:default_optimization" ]
# Enable optimize for speed (-O2) over size (-Os).
+ # TODO(fbarchard): Consider optimize_speed which is O3.
configs += [ "//build/config/compiler:optimize_max" ]
}
@@ -182,11 +219,34 @@
static_library("libyuv_msa") {
sources = [
# MSA Source Files
+ "source/compare_msa.cc",
"source/rotate_msa.cc",
"source/row_msa.cc",
"source/scale_msa.cc",
]
+ deps = [
+ ":libyuv_internal",
+ ]
+
+ public_configs = [ ":libyuv_config" ]
+ }
+}
+
+if (libyuv_use_mmi) {
+ static_library("libyuv_mmi") {
+ sources = [
+ # MMI Source Files
+ "source/compare_mmi.cc",
+ "source/rotate_mmi.cc",
+ "source/row_mmi.cc",
+ "source/scale_mmi.cc",
+ ]
+
+ deps = [
+ ":libyuv_internal",
+ ]
+
public_configs = [ ":libyuv_config" ]
}
}
@@ -222,6 +282,7 @@
"unit_test/compare_test.cc",
"unit_test/convert_test.cc",
"unit_test/cpu_test.cc",
+ "unit_test/cpu_thread_test.cc",
"unit_test/math_test.cc",
"unit_test/planar_test.cc",
"unit_test/rotate_argb_test.cc",
@@ -278,7 +339,6 @@
    # Enable the following 2 macros to turn off assembly for the specified CPU.
# "LIBYUV_DISABLE_X86",
# "LIBYUV_DISABLE_NEON",
- # "LIBYUV_DISABLE_DSPR2",
# Enable the following macro to build libyuv as a shared library (dll).
# "LIBYUV_USING_SHARED_LIBRARY"
]
diff --git a/files/CMakeLists.txt b/files/CMakeLists.txt
index 6420371..ed4948f 100644
--- a/files/CMakeLists.txt
+++ b/files/CMakeLists.txt
@@ -59,7 +59,11 @@
endif()
add_executable(libyuv_unittest ${ly_unittest_sources})
- target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY} pthread)
+ target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY})
+ find_library(PTHREAD_LIBRARY pthread)
+ if(NOT PTHREAD_LIBRARY STREQUAL "PTHREAD_LIBRARY-NOTFOUND")
+ target_link_libraries(libyuv_unittest pthread)
+ endif()
if (JPEG_FOUND)
target_link_libraries(libyuv_unittest ${JPEG_LIBRARY})
endif()
@@ -68,14 +72,18 @@
target_link_libraries(libyuv_unittest glibc-compat)
endif()
- target_link_libraries(libyuv_unittest gflags)
+ find_library(GFLAGS_LIBRARY gflags)
+ if(NOT GFLAGS_LIBRARY STREQUAL "GFLAGS_LIBRARY-NOTFOUND")
+ target_link_libraries(libyuv_unittest gflags)
+ add_definitions(-DLIBYUV_USE_GFLAGS)
+ endif()
endif()
# install the conversion tool, .so, .a, and all the header files
INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin )
INSTALL ( TARGETS ${ly_lib_static} DESTINATION lib )
-INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib )
+INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib RUNTIME DESTINATION bin )
INSTALL ( DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include )
# create the .deb and .rpm packages using cpack
diff --git a/files/DEPS b/files/DEPS
index 803f481..c5f81b8 100644
--- a/files/DEPS
+++ b/files/DEPS
@@ -1,44 +1,79 @@
vars = {
'chromium_git': 'https://chromium.googlesource.com',
- 'chromium_revision': 'da7cc8ca4c326895886b10df62d513fac256d74f',
- 'swarming_revision': '11e31afa5d330756ff87aa12064bb5d032896cb5',
+ 'chromium_revision': '4476bd69d1c8e4e1cde8633d3b33c992f7d3a6d0',
+ 'swarming_revision': '0e3e1c4dc4e79f25a5b58fcbc135dc93183c0c54',
# Three lines of non-changing comments so that
# the commit queue can handle CLs rolling lss
# and whatever else without interference from each other.
- 'lss_revision': '63f24c8221a229f677d26ebe8f3d1528a9d787ac',
+ 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154',
# Three lines of non-changing comments so that
# the commit queue can handle CLs rolling catapult
# and whatever else without interference from each other.
- 'catapult_revision': '49eb11f63eb4d552d634833a01d2710208ba0523',
+ 'catapult_revision': 'a24a725f7834c16b3628bfb63f349b3480bf9592',
+ # the commit queue can handle CLs rolling android_sdk_build-tools_version
+ # and whatever else without interference from each other.
+ 'android_sdk_build-tools_version': 'DLK621q5_Bga5EsOr7cp6bHWWxFKx6UHLu_Ix_m3AckC',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_emulator_version
+ # and whatever else without interference from each other.
+ 'android_sdk_emulator_version': 'ki7EDQRAiZAUYlnTWR1XmI6cJTk65fJ-DNZUU1zrtS8C',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_extras_version
+ # and whatever else without interference from each other.
+ 'android_sdk_extras_version': 'iIwhhDox5E-mHgwUhCz8JACWQCpUjdqt5KTY9VLugKQC',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_patcher_version
+ # and whatever else without interference from each other.
+ 'android_sdk_patcher_version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_platform-tools_version
+ # and whatever else without interference from each other.
+ 'android_sdk_platform-tools_version': '4Y2Cb2LGzoc-qt-oIUIlhySotJaKeE3ELFedSVe6Uk8C',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_platforms_version
+ # and whatever else without interference from each other.
+ 'android_sdk_platforms_version': 'Kg2t9p0YnQk8bldUv4VA3o156uPXLUfIFAmVZ-Gm5ewC',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_sources_version
+ # and whatever else without interference from each other.
+ 'android_sdk_sources_version': 'K9uEn3JvNELEVjjVK_GQD3ZQD3rqAnJSxCWxjmUmRkgC',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_tools_version
+ # and whatever else without interference from each other.
+ 'android_sdk_tools_version': 'wYcRQC2WHsw2dKWs4EA7fw9Qsyzu1ds1_fRjKmGxe5QC',
+ # Three lines of non-changing comments so that
+ # the commit queue can handle CLs rolling android_sdk_tools-lint_version
+ # and whatever else without interference from each other.
+ 'android_sdk_tools-lint_version': '89hXqZYzCum3delB5RV7J_QyWkaRodqdtQS0s3LMh3wC',
}
deps = {
'src/build':
- Var('chromium_git') + '/chromium/src/build' + '@' + '15013685bdd59b8e548ffdef88b6fddae4f0e49c',
+ Var('chromium_git') + '/chromium/src/build' + '@' + '669e41d6f18842ed5740449662a71b715dc607c6',
'src/buildtools':
- Var('chromium_git') + '/chromium/buildtools.git' + '@' + '88811f48a6b79786ef35be86825642fc33011151',
+ Var('chromium_git') + '/chromium/buildtools.git' + '@' + '0e1cbc4eab6861b0c84bf2ed9a3c4b7aa2063819',
'src/testing':
- Var('chromium_git') + '/chromium/src/testing' + '@' + '6885521e316f349c47c4f23f5bce9bd44cb0eece',
- 'src/testing/gtest':
- Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '6f8a66431cb592dad629028a50b3dd418a408c87',
- 'src/testing/gmock':
- Var('chromium_git') + '/external/googlemock.git' + '@' + '0421b6f358139f02e102c9c332ce19a33faf75be', # from svn revision 566
+ Var('chromium_git') + '/chromium/src/testing' + '@' + 'b1c6aeebeabcc177a83ff0a33dc6c3ab03d4aa94',
'src/third_party':
- Var('chromium_git') + '/chromium/src/third_party' + '@' + 'be0a7d2accc6d1e69eb9fa005d98e1558b9227ff',
+ Var('chromium_git') + '/chromium/src/third_party' + '@' + 'be3e0fc18f2e9ea14d0e9369e539eae5986335fd',
'src/third_party/catapult':
- Var('chromium_git') + '/external/github.com/catapult-project/catapult.git' + '@' + Var('catapult_revision'),
+ Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'),
'src/third_party/colorama/src':
Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8',
+ 'src/third_party/freetype/src':
+ Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'd01e28f41f8810c8ea422b854f8722659589fa99',
+ 'src/third_party/googletest/src':
+ Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '879ac092fde0a19e1b3a61b2546b2a422b1528bc',
+ 'src/third_party/harfbuzz-ng/src':
+ Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '26c5b54fb09fb45e02c9c4618bcea4958c698953',
'src/third_party/libjpeg_turbo':
- Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '7260e4d8b8e1e40b17f03fafdf1cd83296900f76',
+ Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '61a2bbaa9aec89cb2c882d87ace6aba9aee49bb9',
'src/third_party/yasm/source/patched-yasm':
- Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '7da28c6c7c6a1387217352ce02b31754deb54d2a',
+ Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad',
'src/tools':
- Var('chromium_git') + '/chromium/src/tools' + '@' + '80ce3971a8a250e9d0180e38a29553273877166c',
- 'src/tools/gyp':
- Var('chromium_git') + '/external/gyp.git' + '@' + 'e7079f0e0e14108ab0dba58728ff219637458563',
- 'src/tools/swarming_client':
- Var('chromium_git') + '/external/swarming.client.git' + '@' + Var('swarming_revision'),
+ Var('chromium_git') + '/chromium/src/tools' + '@' + '419541c8352b3b75a99c9a5a7c0d1e7b92f3fcf7',
+ 'src/tools/swarming_client':
+ Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'),
# libyuv-only dependencies (not present in Chromium).
'src/third_party/gflags':
@@ -47,44 +82,780 @@
Var('chromium_git') + '/external/github.com/gflags/gflags' + '@' + '03bebcb065c83beff83d50ae025a55a4bf94dfca',
'src/third_party/gtest-parallel':
Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e',
-}
-deps_os = {
- 'android': {
- 'src/base':
- Var('chromium_git') + '/chromium/src/base' + '@' + '636a52bf9d2ab7ea4e97d0a933bbd5706acbbc7c',
- 'src/third_party/android_tools':
- Var('chromium_git') + '/android_tools.git' + '@' + 'b65c4776dac2cf1b80e969b3b2d4e081b9c84f29',
- 'src/third_party/ced/src':
- Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'e21eb6aed10b9f6e2727f136c52420033214d458',
- 'src/third_party/icu':
- Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'b34251f8b762f8e2112a89c587855ca4297fed96',
- 'src/third_party/jsr-305/src':
- Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
- 'src/third_party/junit/src':
- Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
- 'src/third_party/lss':
- Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
- 'src/third_party/mockito/src':
- Var('chromium_git') + '/external/mockito/mockito.git' + '@' + 'de83ad4598ad4cf5ea53c69a8a8053780b04b850',
- 'src/third_party/requests/src':
- Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
- 'src/third_party/robolectric/robolectric':
- Var('chromium_git') + '/external/robolectric.git' + '@' + 'e38b49a12fdfa17a94f0382cc8ffaf69132fd09b',
+ 'src/third_party/lss': {
+ 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ 'condition': 'checkout_android or checkout_linux',
},
- 'ios': {
- 'src/ios':
- Var('chromium_git') + '/chromium/src/ios' + '@' + '9d4d917abc902ad9eb512839948b880194f76338',
+
+ # Android deps:
+ 'src/third_party/accessibility_test_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/accessibility-test-framework',
+ 'version': 'version:2.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
},
- 'unix': {
- 'src/third_party/lss':
- Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'),
+ 'src/third_party/auto/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82',
+ 'condition': 'checkout_android',
},
- 'win': {
- # Dependencies used by libjpeg-turbo
- 'src/third_party/yasm/binaries':
- Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
+ 'src/base': {
+ 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '162a5d66ad148f26bbbe6b6ecaf5c1bafa2173e6',
+ 'condition': 'checkout_android',
},
+ 'src/third_party/bazel': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bazel',
+ 'version': 'version:0.10.0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/bouncycastle': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/bouncycastle',
+ 'version': 'version:1.46-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_ndk': {
+ 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '4e2cea441bfd43f0863d14f57b1e1844260b9884',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/android_support_test_runner': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_support_test_runner',
+ 'version': 'version:0.5-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_tools': {
+ 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'e958d6ea74442d4e0849bb8a018d215a0e78981d',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/android_sdk/public': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_sdk/public/build-tools',
+ 'version': Var('android_sdk_build-tools_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/emulator',
+ 'version': Var('android_sdk_emulator_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/extras',
+ 'version': Var('android_sdk_extras_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/patcher',
+ 'version': Var('android_sdk_patcher_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platform-tools',
+ 'version': Var('android_sdk_platform-tools_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/platforms',
+ 'version': Var('android_sdk_platforms_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/sources',
+ 'version': Var('android_sdk_sources_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/tools',
+ 'version': Var('android_sdk_tools_version'),
+ },
+ {
+ 'package': 'chromium/third_party/android_sdk/public/tools-lint',
+ 'version': Var('android_sdk_tools-lint_version'),
+ },
+ ],
+ 'condition': 'checkout_android_native_support',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/android_build_tools/aapt2': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_tools_aapt2',
+ 'version': 'version:3.2.0-alpha18-4804415-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/byte_buddy': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/byte_buddy',
+ 'version': 'version:1.4.17-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ced/src': {
+ 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/errorprone/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/findbugs': {
+ 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/gson': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/gson',
+ 'version': 'version:2.8.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/guava': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/guava',
+ 'version': 'version:23.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/hamcrest': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/hamcrest',
+ 'version': 'version:1.3-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/icu': {
+ 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd65301491c513d49163ad29c853eb85c02c8d5b4',
+ },
+ 'src/third_party/icu4j': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/icu4j',
+ 'version': 'version:53.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/intellij': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/intellij',
+ 'version': 'version:12.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/jsr-305/src': {
+ 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/junit/src': {
+ 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/mockito/src': {
+ 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/objenesis': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/objenesis',
+ 'version': 'version:2.4-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ow2_asm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/ow2_asm',
+ 'version': 'version:5.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/r8': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/r8',
+ 'version': 'version:1.0.30',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/proguard': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/proguard',
+ 'version': '3bd778c422ea5496de2ef25c007a517dbb5ce5ca',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/requests/src': {
+ 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'f172b30356d821d180fa4ecfa3e71c7274a32de4',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/robolectric': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/robolectric',
+ 'version': 'version:3.5.1',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/robolectric/robolectric': {
+ 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/sqlite4java': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/sqlite4java',
+ 'version': 'version:0.282-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+ 'src/third_party/ub-uiautomator/lib': {
+ 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434',
+ 'condition': 'checkout_android',
+ },
+ 'src/third_party/xstream': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/xstream',
+ 'version': 'version:1.4.8-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ # iOS deps:
+ 'src/ios': {
+ 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '44be3c093cf2db7ab4cf1997d6a1a07722f1f391',
+ 'condition': 'checkout_ios'
+ },
+
+ # Win deps:
+ # Dependencies used by libjpeg-turbo
+ 'src/third_party/yasm/binaries': {
+ 'url': Var('chromium_git') + '/chromium/deps/yasm/binaries.git' + '@' + '52f9b3f4b0aa06da24ef8b123058bb61ee468881',
+ 'condition': 'checkout_win',
+ },
+
+ # === ANDROID_DEPS Generated Code Start ===
+ # Generated by //tools/android/roll/android_deps/fetch_all.sh
+ 'src/third_party/android_deps/libs/android_arch_core_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_cardview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_design': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_design',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_gridlayout_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_mediarouter_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_multidex': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex',
+ 'version': 'version:1.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_palette_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_leanback_v17': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v14': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_preference_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_annotations': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_ui': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_core_utils': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_fragment': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_media_compat': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v13': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_v4': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_android_support_transition': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition',
+ 'version': 'version:27.0.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common',
+ 'version': 'version:12.0.1-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_google_android_play_core': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core',
+ 'version': 'version:1.3.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ 'src/third_party/android_deps/libs/com_squareup_javapoet': {
+ 'packages': [
+ {
+ 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet',
+ 'version': 'version:1.11.0-cr0',
+ },
+ ],
+ 'condition': 'checkout_android',
+ 'dep_type': 'cipd',
+ },
+
+ # === ANDROID_DEPS Generated Code End ===
}
# Define rules for which include paths are allowed in our source.
@@ -116,180 +887,41 @@
'src',
],
},
- # Android dependencies. Many are downloaded using Google Storage these days.
- # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
- # such dependencies we share with Chromium.
+ # Downloads the current stable linux sysroot to build/linux/ if needed.
{
- # This downloads SDK extras and puts them in the
- # third_party/android_tools/sdk/extras directory.
- 'name': 'sdkextras',
+ 'name': 'sysroot_arm',
'pattern': '.',
- # When adding a new sdk extras package to download, add the package
- # directory and zip file to .gitignore in third_party/android_tools.
- 'action': ['python',
- 'src/build/android/play_services/update.py',
- 'download'
- ],
- },
- {
- 'name': 'intellij',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-intellij',
- '-l', 'third_party/intellij'
- ],
- },
- {
- 'name': 'javax_inject',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-javax-inject',
- '-l', 'third_party/javax_inject'
- ],
- },
- {
- 'name': 'hamcrest',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-hamcrest',
- '-l', 'third_party/hamcrest'
- ],
- },
- {
- 'name': 'guava',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-guava',
- '-l', 'third_party/guava'
- ],
- },
- {
- 'name': 'android_support_test_runner',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-android-support-test-runner',
- '-l', 'third_party/android_support_test_runner'
- ],
- },
- {
- 'name': 'byte_buddy',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-byte-buddy',
- '-l', 'third_party/byte_buddy'
- ],
- },
- {
- 'name': 'espresso',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-espresso',
- '-l', 'third_party/espresso'
- ],
- },
- {
- 'name': 'robolectric_libs',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-robolectric',
- '-l', 'third_party/robolectric'
- ],
- },
- {
- 'name': 'apache_velocity',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-apache-velocity',
- '-l', 'third_party/apache_velocity'
- ],
- },
- {
- 'name': 'ow2_asm',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-ow2-asm',
- '-l', 'third_party/ow2_asm'
- ],
- },
- {
- 'name': 'icu4j',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-icu4j',
- '-l', 'third_party/icu4j'
- ],
- },
- {
- 'name': 'accessibility_test_framework',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-accessibility-test-framework',
- '-l', 'third_party/accessibility_test_framework'
- ],
- },
- {
- 'name': 'bouncycastle',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-bouncycastle',
- '-l', 'third_party/bouncycastle'
- ],
- },
- {
- 'name': 'sqlite4java',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-sqlite4java',
- '-l', 'third_party/sqlite4java'
- ],
- },
- {
- 'name': 'objenesis',
- 'pattern': '.',
- 'action': ['python',
- 'src/build/android/update_deps/update_third_party_deps.py',
- 'download',
- '-b', 'chromium-objenesis',
- '-l', 'third_party/objenesis'
- ],
- },
- {
- # Downloads the current stable linux sysroot to build/linux/ if needed.
- # This sysroot updates at about the same rate that the chrome build deps
- # change. This script is a no-op except for linux users who are doing
- # official chrome builds or cross compiling.
- 'name': 'sysroot',
- 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm',
'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
- '--running-as-hook'],
+ '--arch=arm'],
+ },
+ {
+ 'name': 'sysroot_arm64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_arm64',
+ 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=arm64'],
+ },
+ {
+ 'name': 'sysroot_x86',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)',
+ 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x86'],
+ },
+ {
+ 'name': 'sysroot_mips',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_mips',
+ 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=mips'],
+ },
+ {
+ 'name': 'sysroot_x64',
+ 'pattern': '.',
+ 'condition': 'checkout_linux and checkout_x64',
+ 'action': ['python', 'src/build/linux/sysroot_scripts/install-sysroot.py',
+ '--arch=x64'],
},
{
# Update the Windows toolchain if necessary.
@@ -297,6 +929,12 @@
'pattern': '.',
'action': ['python', 'src/build/vs_toolchain.py', 'update'],
},
+ {
+ # Update the Mac toolchain if necessary.
+ 'name': 'mac_toolchain',
+ 'pattern': '.',
+ 'action': ['python', 'src/build/mac_toolchain.py'],
+ },
# Pull binutils for linux, enabled debug fission for faster linking /
# debugging when used with clang on Ubuntu Precise.
# https://code.google.com/p/chromium/issues/detail?id=352046
@@ -313,7 +951,7 @@
# Note: On Win, this should run after win_toolchain, as it may use it.
'name': 'clang',
'pattern': '.',
- 'action': ['python', 'src/tools/clang/scripts/update.py', '--if-needed'],
+ 'action': ['python', 'src/tools/clang/scripts/update.py'],
},
{
# Update LASTCHANGE.
@@ -425,10 +1063,29 @@
],
},
{
- 'name': 'clang_format_merge_driver',
+ # We used to use src as a CIPD root. We moved it to a different directory
+ # in crrev.com/c/930178 but left the clobber here to ensure that that CL
+ # could be reverted safely. This can be safely removed once crbug.com/794764
+ # is resolved.
+ 'name': 'Android Clobber Deprecated CIPD Root',
'pattern': '.',
- 'action': [ 'python',
- 'src/tools/clang_format_merge_driver/install_git_hook.py',
+ 'condition': 'checkout_android',
+ 'action': ['src/build/cipd/clobber_cipd_root.py',
+ '--root', 'src',
+ ],
+ },
+ # Android dependencies. Many are downloaded using Google Storage these days.
+ # They're copied from https://cs.chromium.org/chromium/src/DEPS for all
+ # such dependencies we share with Chromium.
+ {
+ # This downloads SDK extras and puts them in the
+ # third_party/android_sdk/public/extras directory.
+ 'name': 'sdkextras',
+ 'condition': 'checkout_android',
+ 'pattern': '.',
+ 'action': ['vpython',
+ 'src/build/android/play_services/update.py',
+ 'download'
],
},
]
@@ -436,6 +1093,4 @@
recursedeps = [
# buildtools provides clang_format, libc++, and libc++abi.
'src/buildtools',
- # android_tools manages the NDK.
- 'src/third_party/android_tools',
]
diff --git a/files/OWNERS b/files/OWNERS
deleted file mode 100644
index 2db52d3..0000000
--- a/files/OWNERS
+++ /dev/null
@@ -1,13 +0,0 @@
-fbarchard@chromium.org
-magjed@chromium.org
-torbjorng@chromium.org
-
-per-file *.gyp=kjellander@chromium.org
-per-file *.gn=kjellander@chromium.org
-per-file .gitignore=*
-per-file AUTHORS=*
-per-file DEPS=*
-per-file PRESUBMIT.py=kjellander@chromium.org
-per-file gyp_libyuv.py=kjellander@chromium.org
-per-file setup_links.py=*
-per-file sync_chromium.py=kjellander@chromium.org
diff --git a/files/README.chromium b/files/README.chromium
index 8a0f066..bddc202 100644
--- a/files/README.chromium
+++ b/files/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1652
+Version: 1732
License: BSD
License File: LICENSE
diff --git a/files/README.md b/files/README.md
index b59b71c..db70b7f 100644
--- a/files/README.md
+++ b/files/README.md
@@ -1,18 +1,18 @@
**libyuv** is an open source project that includes YUV scaling and conversion functionality.
* Scale YUV to prepare content for compression, with point, bilinear or box filter.
-* Convert to YUV from webcam formats.
-* Convert from YUV to formats for rendering/effects.
+* Convert to YUV from webcam formats for compression.
+* Convert to RGB formats for rendering/effects.
* Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode.
-* Optimized for SSE2/SSSE3/AVX2 on x86/x64.
+* Optimized for SSSE3/AVX2 on x86/x64.
* Optimized for Neon on Arm.
-* Optimized for DSP R2 on Mips.
+* Optimized for MSA on Mips.
### Development
-See [Getting started] [1] for instructions on how to get started developing.
+See [Getting started][1] for instructions on how to get started developing.
-You can also browse the [docs directory] [2] for more documentation.
+You can also browse the [docs directory][2] for more documentation.
-[1]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/getting_started.md
-[2]: https://chromium.googlesource.com/libyuv/libyuv/+/master/docs/
+[1]: ./docs/getting_started.md
+[2]: ./docs/
diff --git a/files/build_overrides/build.gni b/files/build_overrides/build.gni
index 0a6affb..6d8319b 100644
--- a/files/build_overrides/build.gni
+++ b/files/build_overrides/build.gni
@@ -6,14 +6,6 @@
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
-# Using same overrides as WebRTC
-# See https://bugs.chromium.org/p/webrtc/issues/detail?id=5453.
-# Some WebRTC targets require the 10.7 deployment version of the Mac SDK and a
-# 10.11 min SDK but those targets are only used in non-Chromium builds. We can
-# remove this when Chromium drops 10.6 support and also requires 10.7.
-mac_sdk_min_build_override = "10.11"
-mac_deployment_target_build_override = "10.7"
-
# Some non-Chromium builds don't use Chromium's third_party/binutils.
linux_use_bundled_binutils_override = true
@@ -41,6 +33,14 @@
# so we just ignore that assert. See https://crbug.com/648948 for more info.
ignore_elf32_limitations = true
-# Use system Xcode installation instead of the Chromium bundled Mac toolchain,
-# since it contains only SDK 10.11, not 10.12 which WebRTC needs.
-use_system_xcode = true
+# Use bundled hermetic Xcode installation maintained by Chromium,
+# except for local iOS builds where it is unsupported.
+if (host_os == "mac") {
+ _result = exec_script("//build/mac/should_use_hermetic_xcode.py",
+ [ target_os ],
+ "value")
+ assert(_result != 2,
+         "Do not allow building targets with the default " +
+ "hermetic toolchain if the minimum OS version is not met.")
+ use_system_xcode = _result == 0
+}
diff --git a/files/docs/deprecated_builds.md b/files/docs/deprecated_builds.md
index d54a028..29e0bf9 100644
--- a/files/docs/deprecated_builds.md
+++ b/files/docs/deprecated_builds.md
@@ -165,11 +165,11 @@
arm32 disassembly:
- third_party/android_tools/ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
+ third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o
arm64 disassembly:
- third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
+ third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o
Running tests:
diff --git a/files/docs/environment_variables.md b/files/docs/environment_variables.md
index 5802599..cd8159a 100644
--- a/files/docs/environment_variables.md
+++ b/files/docs/environment_variables.md
@@ -6,7 +6,10 @@
By default the cpu is detected and the most advanced form of SIMD is used. But you can disable instruction sets selectively, or completely, falling back on C code. Set the variable to 1 to disable the specified instruction set.
+## All CPUs
LIBYUV_DISABLE_ASM
+
+## Intel CPUs
LIBYUV_DISABLE_X86
LIBYUV_DISABLE_SSE2
LIBYUV_DISABLE_SSSE3
@@ -14,12 +17,25 @@
LIBYUV_DISABLE_SSE42
LIBYUV_DISABLE_AVX
LIBYUV_DISABLE_AVX2
- LIBYUV_DISABLE_AVX3
LIBYUV_DISABLE_ERMS
LIBYUV_DISABLE_FMA3
- LIBYUV_DISABLE_DSPR2
+ LIBYUV_DISABLE_F16C
+ LIBYUV_DISABLE_AVX512BW
+ LIBYUV_DISABLE_AVX512VL
+ LIBYUV_DISABLE_AVX512VBMI
+ LIBYUV_DISABLE_AVX512VBMI2
+ LIBYUV_DISABLE_AVX512VBITALG
+ LIBYUV_DISABLE_AVX512VPOPCNTDQ
+ LIBYUV_DISABLE_GFNI
+
+## ARM CPUs
+
LIBYUV_DISABLE_NEON
+## MIPS CPUs
+ LIBYUV_DISABLE_MSA
+ LIBYUV_DISABLE_MMI
+
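These are ordinary environment variables consulted during CPU detection. A minimal C++ sketch of how such a kill switch can gate SIMD dispatch (an editor's illustration, not libyuv's actual logic, which lives in source/cpu_id.cc; kHasNEON is a made-up flag bit):

    #include <cstdint>
    #include <cstdlib>

    // Treat a LIBYUV_DISABLE_* variable set to 1 as "disable this path".
    static bool EnvDisabled(const char* name) {
      const char* v = std::getenv(name);
      return v != nullptr && v[0] == '1';
    }

    constexpr uint32_t kHasNEON = 0x4;  // hypothetical flag bit

    uint32_t FilterCpuFlags(uint32_t detected) {
      if (EnvDisabled("LIBYUV_DISABLE_ASM")) return 0;  // force C fallback
      if (EnvDisabled("LIBYUV_DISABLE_NEON")) detected &= ~kHasNEON;
      return detected;
    }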
# Test Width/Height/Repeat
The unittests default to a small image (128x72) to run fast. These can be set by environment variables to test specific resolutions.
diff --git a/files/docs/formats.md b/files/docs/formats.md
index cddfe02..97e8ce0 100644
--- a/files/docs/formats.md
+++ b/files/docs/formats.md
@@ -35,9 +35,8 @@
# FOURCC (Four Character Code) List
The following is extracted from video_common.h as a complete list of formats supported by libyuv.
-
enum FourCC {
- // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
@@ -46,37 +45,36 @@
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
- // 1 Secondary YUV formats: row biplanar.
+ // 1 Secondary YUV format: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
+ FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
- FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
- // 4 Secondary RGB formats: 4 Bayer Patterns.
- FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
- FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
- FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
- FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 8 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
- FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
+ FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -97,9 +95,6 @@
FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
- // 1 Auxiliary compressed YUV format set aside for capturer.
- FOURCC_H264 = FOURCC('H', '2', '6', '4'),
-
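Each FOURCC above is just four ASCII bytes packed little-endian into a uint32; a short C++ sketch of the packing performed by the FOURCC macro in include/libyuv/video_common.h:

    #include <cstdint>

    // Four ASCII characters packed little-endian, as libyuv's FOURCC macro does.
    constexpr uint32_t Fourcc(char a, char b, char c, char d) {
      return static_cast<uint32_t>(a) | (static_cast<uint32_t>(b) << 8) |
             (static_cast<uint32_t>(c) << 16) | (static_cast<uint32_t>(d) << 24);
    }

    static_assert(Fourcc('I', '4', '2', '0') == 0x30323449,
                  "'I' lands in the low byte");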
# Planar YUV
The following formats contain a full size Y plane followed by 1 or 2
planes for UV: I420, I422, I444, I400, NV21, NV12
@@ -138,3 +133,31 @@
Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so it's a macro).
ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha. Other functions don't care.
+
+# RGB24 and RAW
+
+There are 2 RGB layouts - RGB24 (aka 24BG) and RAW
+
+RGB24 is B,G,R in memory
+RAW is R,G,B in memory
+
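A byte-level C++ illustration of the two layouts (an editor's sketch, not library code):

    #include <cstdint>

    struct Rgb { uint8_t r, g, b; };

    // RGB24 (24BG) keeps B,G,R in memory; RAW keeps R,G,B.
    inline Rgb ReadRGB24(const uint8_t* p) { return {p[2], p[1], p[0]}; }
    inline Rgb ReadRAW(const uint8_t* p) { return {p[0], p[1], p[2]}; }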
+# AR30 and XR30
+
+AR30 is 2:10:10:10 ARGB stored in little endian order.
+The 2 bit alpha has 4 values. Here are the comparable 8 bit alpha values.
+0 - 0. 00000000b = 0x00 = 0
+1 - 33%. 01010101b = 0x55 = 85
+2 - 66%. 10101010b = 0xaa = 170
+3 - 100%. 11111111b = 0xff = 255
+The 10 bit RGB values range from 0 to 1023.
+XR30 is the same as AR30 but with no alpha channel.
+
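A C++ sketch of unpacking one AR30 pixel into 8-bit channels, assuming blue occupies the low 10 bits (the usual little-endian 2:10:10:10 reading); the 2-bit alpha expands by multiplying by 0x55, reproducing the 0/85/170/255 table above:

    #include <cstdint>

    void UnpackAR30(uint32_t px, uint8_t* a, uint8_t* r, uint8_t* g, uint8_t* b) {
      uint32_t b10 = px & 0x3ff;          // 0..1023
      uint32_t g10 = (px >> 10) & 0x3ff;
      uint32_t r10 = (px >> 20) & 0x3ff;
      uint32_t a2 = px >> 30;             // 0..3
      *b = static_cast<uint8_t>(b10 >> 2);   // keep the top 8 of 10 bits
      *g = static_cast<uint8_t>(g10 >> 2);
      *r = static_cast<uint8_t>(r10 >> 2);
      *a = static_cast<uint8_t>(a2 * 0x55);  // 0, 85, 170, 255
    }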
+# NV12 and NV21
+
+NV12 is a biplanar format with a full sized Y plane followed by a single
+chroma plane with interleaved U and V values.
+NV21 is the same but with interleaved V and U values.
+The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
+height chroma channel, and therefore is a 420 subsampling.
+NV16 is 16 bits per pixel, with half width and full height. aka 422.
+NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
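The bpp arithmetic above follows directly from the plane geometry; a small C++ sketch (an illustration, not the libyuv API):

    #include <cstddef>
    #include <cstdint>

    // NV12: full-size Y plane, then one half-width, half-height plane of
    // interleaved U,V pairs (NV21 swaps the pair order to V,U).
    inline const uint8_t* NV12UVPlane(const uint8_t* buf, int width, int height) {
      return buf + static_cast<size_t>(width) * height;  // UV follows Y
    }

    // 8 bits of Y per pixel + 16 bits of UV shared by a 2x2 block = 12 bpp.
    inline size_t NV12Size(int width, int height) {
      return static_cast<size_t>(width) * height * 3 / 2;
    }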
diff --git a/files/docs/getting_started.md b/files/docs/getting_started.md
index 5df4c30..4426b60 100644
--- a/files/docs/getting_started.md
+++ b/files/docs/getting_started.md
@@ -27,7 +27,7 @@
},
];
-For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync.`
+For iOS add `;target_os=['ios'];` to your OSX .gclient and run `gclient sync`.
Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master
@@ -48,11 +48,8 @@
Then run:
- export GYP_DEFINES="OS=android"
gclient sync
-The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make). This generation can also be forced manually: `gclient runhooks`
-
To get just the source (not buildable):
git clone https://chromium.googlesource.com/libyuv/libyuv
@@ -62,30 +59,15 @@
### Windows
- call gn gen out/Release "--args=is_debug=false target_cpu=\"x86\""
- call gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\""
- ninja -v -C out/Release
- ninja -v -C out/Debug
+ call gn gen out\Release "--args=is_debug=false target_cpu=\"x64\""
+ call gn gen out\Debug "--args=is_debug=true target_cpu=\"x64\""
+ ninja -v -C out\Release
+ ninja -v -C out\Debug
- call gn gen out/Release "--args=is_debug=false target_cpu=\"x64\""
- call gn gen out/Debug "--args=is_debug=true target_cpu=\"x64\""
- ninja -v -C out/Release
- ninja -v -C out/Debug
-
-#### Building with clang-cl
-
- set GYP_DEFINES=clang=1 target_arch=ia32
- call python tools\clang\scripts\update.py
-
- call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x86\""
- call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x86\""
- ninja -v -C out/Release
- ninja -v -C out/Debug
-
- call gn gen out/Release "--args=is_debug=false is_official_build=false is_clang=true target_cpu=\"x64\""
- call gn gen out/Debug "--args=is_debug=true is_official_build=false is_clang=true target_cpu=\"x64\""
- ninja -v -C out/Release
- ninja -v -C out/Debug
+ call gn gen out\Release "--args=is_debug=false target_cpu=\"x86\""
+ call gn gen out\Debug "--args=is_debug=true target_cpu=\"x86\""
+ ninja -v -C out\Release
+ ninja -v -C out\Debug
### macOS and Linux
@@ -113,23 +95,20 @@
ios simulator
- gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
- gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"x86\""
+ gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false use_xcode_clang=true target_cpu=\"x86\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false use_xcode_clang=true target_cpu=\"x86\""
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+ios disassembly
+
+ otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+
### Android
https://code.google.com/p/chromium/wiki/AndroidBuildInstructions
Add to .gclient last line: `target_os=['android'];`
-armv7
-
- gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\""
- gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\""
- ninja -v -C out/Debug libyuv_unittest
- ninja -v -C out/Release libyuv_unittest
-
arm64
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\""
@@ -137,6 +116,13 @@
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+armv7
+
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\""
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\""
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
ia32
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"x86\""
@@ -144,44 +130,41 @@
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
-mipsel
+mips
- gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
- gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
- ninja -v -C out/Debug libyuv_unittest
- ninja -v -C out/Release libyuv_unittest
-
- gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
- gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+ gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
+ gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true"
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
arm disassembly:
- third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
+ third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt
- third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
+ third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt
- third_party/android_tools/ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+ third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt
+
+ Caveat: Disassembly may require optimize_max be disabled in BUILD.gn
Running tests:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=*
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=*
Running test as benchmark:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Running test with C code:
- build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1"
+ out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1
### Build targets
ninja -C out/Debug libyuv
ninja -C out/Debug libyuv_unittest
ninja -C out/Debug compare
- ninja -C out/Debug convert
+ ninja -C out/Debug yuvconvert
ninja -C out/Debug psnr
ninja -C out/Debug cpuid
@@ -192,6 +175,15 @@
ninja -v -C out/Debug libyuv_unittest
ninja -v -C out/Release libyuv_unittest
+### MIPS Linux
+
+mips
+
+ gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false"
+ ninja -v -C out/Debug libyuv_unittest
+ ninja -v -C out/Release libyuv_unittest
+
## Building the Library with make
### Linux
@@ -251,16 +243,11 @@
out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*"
-### OSX
+### macOS and Linux
out/Release/libyuv_unittest --gtest_filter="*"
-### Linux
-
- out/Release/libyuv_unittest --gtest_filter="*"
-
-Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g.
-
+Replace --gtest_filter="*" with a specific unittest to run. May include wildcards.
out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt
## CPU Emulator tools
@@ -275,12 +262,20 @@
~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=**I420ToARGB_Opt
+### Intel Architecture Code Analyzer
+
+Insert these 2 macros into the assembly code to be analyzed:
+ IACA_ASM_START
+ IACA_ASM_END
+Build the code as usual, then run iaca on the object file.
+ ~/iaca-lin64/bin/iaca.sh -reduceout -arch HSW out/Release/obj/libyuv_internal/compare_gcc.o
+
## Sanitizers
- gn gen out/Debug "--args=is_debug=true is_asan=true"
- ninja -v -C out/Debug
+ gn gen out/Release "--args=is_debug=false is_msan=true"
+ ninja -v -C out/Release
- Sanitizers available: tsan, msan, asan, ubsan, lsan
+Sanitizers available: asan, msan, tsan, ubsan, lsan, ubsan_vptr
### Running Dr Memory memcheck for Windows
diff --git a/files/include/libyuv/basic_types.h b/files/include/libyuv/basic_types.h
index 7d98bb9..1bea67f 100644
--- a/files/include/libyuv/basic_types.h
+++ b/files/include/libyuv/basic_types.h
@@ -11,79 +11,36 @@
#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
#define INCLUDE_LIBYUV_BASIC_TYPES_H_
-#include <stddef.h> // for NULL, size_t
+#include <stddef.h> // For size_t and NULL
+
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
+#define INT_TYPES_DEFINED
#if defined(_MSC_VER) && (_MSC_VER < 1600)
#include <sys/types.h> // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
#else
-#include <stdint.h> // for uintptr_t
-#endif
-
-#ifndef GG_LONGLONG
-#ifndef INT_TYPES_DEFINED
-#define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef unsigned __int64 uint64;
-typedef __int64 int64;
-#ifndef INT64_C
-#define INT64_C(x) x##I64
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x##UI64
-#endif
-#define INT64_F "I64"
-#else // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64; // NOLINT
-typedef long int64; // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x##L
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x##UL
-#endif
-#define INT64_F "l"
-#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64; // NOLINT
-typedef long long int64; // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x##LL
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x##ULL
-#endif
-#define INT64_F "ll"
-#endif // __LP64__
-#endif // COMPILER_MSVC
-typedef unsigned int uint32;
-typedef int int32;
-typedef unsigned short uint16; // NOLINT
-typedef short int16; // NOLINT
-typedef unsigned char uint8;
-typedef signed char int8;
+#include <stdint.h> // for uintptr_t and C99 types
+#endif // defined(_MSC_VER) && (_MSC_VER < 1600)
+// The legacy (non-stdint) types are deprecated. Enable this macro to keep them.
+#ifdef LIBYUV_LEGACY_TYPES
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
+#endif // LIBYUV_LEGACY_TYPES
#endif // INT_TYPES_DEFINED
-#endif // GG_LONGLONG
-
-// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
- defined(_M_IX86)
-#define CPU_X86 1
-#endif
-// Detect compiler is for ARM.
-#if defined(__arm__) || defined(_M_ARM)
-#define CPU_ARM 1
-#endif
-
-#ifndef ALIGNP
-#ifdef __cplusplus
-#define ALIGNP(p, t) \
- reinterpret_cast<uint8*>( \
- ((reinterpret_cast<uintptr_t>(p) + ((t)-1)) & ~((t)-1)))
-#else
-#define ALIGNP(p, t) \
- (uint8*)((((uintptr_t)(p) + ((t)-1)) & ~((t)-1))) /* NOLINT */
-#endif
-#endif
#if !defined(LIBYUV_API)
#if defined(_WIN32) || defined(__CYGWIN__)
@@ -103,15 +60,9 @@
#endif // __GNUC__
#endif // LIBYUV_API
+// TODO(fbarchard): Remove bool macros.
#define LIBYUV_BOOL int
#define LIBYUV_FALSE 0
#define LIBYUV_TRUE 1
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
- defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
- (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
diff --git a/files/include/libyuv/compare.h b/files/include/libyuv/compare.h
index 4deca97..3353ad7 100644
--- a/files/include/libyuv/compare.h
+++ b/files/include/libyuv/compare.h
@@ -20,74 +20,85 @@
// Compute a hash for specified memory. Seed of 5381 recommended.
LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
+
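+// Illustrative only, not part of the API: a scalar sketch of the djb2
+// recurrence HashDjb2 computes; a seed of 5381 is Bernstein's constant.
+static inline uint32_t HashDjb2Sketch(const uint8_t* src, uint64_t count,
+                                      uint32_t seed) {
+  uint32_t hash = seed;
+  for (uint64_t i = 0; i < count; ++i) {
+    hash = hash * 33 + src[i];
+  }
+  return hash;
+}
+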
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
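+
+// Illustrative only: Hamming distance is the number of differing bits
+// between the two buffers. A portable scalar sketch of the sum the
+// SSE4.2/AVX2/NEON paths accelerate:
+static inline uint64_t HammingDistanceSketch(const uint8_t* src_a,
+                                             const uint8_t* src_b,
+                                             int count) {
+  uint64_t diff = 0;
+  for (int i = 0; i < count; ++i) {
+    uint8_t x = src_a[i] ^ src_b[i];
+    while (x) {  // Kernighan's loop: clear the lowest set bit.
+      x &= (uint8_t)(x - 1);
+      ++diff;
+    }
+  }
+  return diff;
+}
+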
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+uint32_t ARGBDetect(const uint8_t* argb,
+ int stride_argb,
+ int width,
+ int height);
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count);
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
- int stride_a,
- const uint8* src_b,
- int stride_b,
- int width,
- int height);
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b,
+ int width,
+ int height);
static const int kMaxPsnr = 128;
LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
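+
+// Illustrative only: the usual PSNR definition for 8 bit samples,
+// psnr = 10 * log10(255^2 / (sse / count)), returning the documented
+// maximum kMaxPsnr when sse is 0. Requires <math.h>.
+static inline double PsnrSketch(uint64_t sse, uint64_t count) {
+  if (sse == 0) return kMaxPsnr;  // identical inputs
+  return 10.0 * log10(255.0 * 255.0 * (double)count / (double)sse);
+}
+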
LIBYUV_API
-double CalcFramePsnr(const uint8* src_a,
+double CalcFramePsnr(const uint8_t* src_a,
int stride_a,
- const uint8* src_b,
+ const uint8_t* src_b,
int stride_b,
int width,
int height);
LIBYUV_API
-double I420Psnr(const uint8* src_y_a,
+double I420Psnr(const uint8_t* src_y_a,
int stride_y_a,
- const uint8* src_u_a,
+ const uint8_t* src_u_a,
int stride_u_a,
- const uint8* src_v_a,
+ const uint8_t* src_v_a,
int stride_v_a,
- const uint8* src_y_b,
+ const uint8_t* src_y_b,
int stride_y_b,
- const uint8* src_u_b,
+ const uint8_t* src_u_b,
int stride_u_b,
- const uint8* src_v_b,
+ const uint8_t* src_v_b,
int stride_v_b,
int width,
int height);
LIBYUV_API
-double CalcFrameSsim(const uint8* src_a,
+double CalcFrameSsim(const uint8_t* src_a,
int stride_a,
- const uint8* src_b,
+ const uint8_t* src_b,
int stride_b,
int width,
int height);
LIBYUV_API
-double I420Ssim(const uint8* src_y_a,
+double I420Ssim(const uint8_t* src_y_a,
int stride_y_a,
- const uint8* src_u_a,
+ const uint8_t* src_u_a,
int stride_u_a,
- const uint8* src_v_a,
+ const uint8_t* src_v_a,
int stride_v_a,
- const uint8* src_y_b,
+ const uint8_t* src_y_b,
int stride_y_b,
- const uint8* src_u_b,
+ const uint8_t* src_u_b,
int stride_u_b,
- const uint8* src_v_b,
+ const uint8_t* src_v_b,
int stride_v_b,
int width,
int height);
diff --git a/files/include/libyuv/compare_row.h b/files/include/libyuv/compare_row.h
index 7abc2d4..e95b9d9 100644
--- a/files/include/libyuv/compare_row.h
+++ b/files/include/libyuv/compare_row.h
@@ -18,17 +18,20 @@
extern "C" {
#endif
-#if defined(__pnacl__) || defined(__CLR_VER) || \
- (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
-
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
_MSC_VER >= 1700
@@ -42,39 +45,93 @@
#endif // clang >= 3.4
#endif // __clang__
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
- (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_HASHDJB2_AVX2
-#endif
-
// The following are available for Visual C and GCC:
#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86)))
+ (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
#define HAS_HASHDJB2_SSE41
#define HAS_SUMSQUAREERROR_SSE2
+#define HAS_HAMMINGDISTANCE_SSE42
#endif
// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_HASHDJB2_AVX2
#define HAS_SUMSQUAREERROR_AVX2
#endif
+// The following are available for GCC and clangcl 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_HAMMINGDISTANCE_SSSE3
+#endif
+
+// The following are available for GCC and clangcl 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_HAMMINGDISTANCE_AVX2
+#endif
+
// The following are available for Neon:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
+#define HAS_HAMMINGDISTANCE_NEON
#endif
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_HAMMINGDISTANCE_MSA
+#define HAS_SUMSQUAREERROR_MSA
+#endif
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_HAMMINGDISTANCE_MMI
+#define HAS_SUMSQUAREERROR_MMI
+#endif
+
+uint32_t HammingDistance_C(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_C(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_AVX2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
+
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed);
+uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/convert.h b/files/include/libyuv/convert.h
index f096d19..f571142 100644
--- a/files/include/libyuv/convert.h
+++ b/files/include/libyuv/convert.h
@@ -27,295 +27,433 @@
// Convert I444 to I420.
LIBYUV_API
-int I444ToI420(const uint8* src_y,
+int I444ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// Convert I444 to NV21.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert I422 to I420.
LIBYUV_API
-int I422ToI420(const uint8* src_y,
+int I422ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// Convert I422 to NV21.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Copy I420 to I420.
#define I420ToI420 I420Copy
LIBYUV_API
-int I420Copy(const uint8* src_y,
+int I420Copy(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
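+// Illustrative only: the per-sample 10-to-8 bit step, assuming the
+// conversion simply drops the two low bits (so 1023 maps to 255).
+static inline uint8_t Convert10To8Sketch(uint16_t v) {
+  return (uint8_t)(v >> 2);
+}
+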
// Convert I400 (grey) to I420.
LIBYUV_API
-int I400ToI420(const uint8* src_y,
+int I400ToI420(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// Convert I400 (grey) to NV21.
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
#define J400ToJ420 I400ToI420
// Convert NV12 to I420.
LIBYUV_API
-int NV12ToI420(const uint8* src_y,
+int NV12ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert NV21 to I420.
LIBYUV_API
-int NV21ToI420(const uint8* src_y,
+int NV21ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_vu,
+ const uint8_t* src_vu,
int src_stride_vu,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert YUY2 to I420.
LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2,
+int YUY2ToI420(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert UYVY to I420.
LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy,
+int UYVYToI420(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
// Convert M420 to I420.
LIBYUV_API
-int M420ToI420(const uint8* src_m420,
+int M420ToI420(const uint8_t* src_m420,
int src_stride_m420,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert Android420 to I420.
LIBYUV_API
-int Android420ToI420(const uint8* src_y,
+int Android420ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- int pixel_stride_uv,
- uint8* dst_y,
+ int src_pixel_stride_uv,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
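+
+// src_pixel_stride_uv describes Android's flexible YUV_420_888 chroma
+// layout: 1 means tightly packed U/V planes (I420-like), 2 means
+// interleaved U/V bytes (NV12/NV21-like). A hedged usage sketch with
+// hypothetical buffer names, e.g. for an NV21-style camera frame:
+//
+//   Android420ToI420(y, y_stride, u, uv_stride, v, uv_stride,
+//                    /*src_pixel_stride_uv=*/2,
+//                    dst_y, width, dst_u, width / 2, dst_v, width / 2,
+//                    width, height);
+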
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
-int ARGBToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int ARGBToI420(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// BGRA little endian (argb in memory) to I420.
LIBYUV_API
-int BGRAToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int BGRAToI420(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// ABGR little endian (rgba in memory) to I420.
LIBYUV_API
-int ABGRToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int ABGRToI420(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGBA little endian (abgr in memory) to I420.
LIBYUV_API
-int RGBAToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int RGBAToI420(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB little endian (bgr in memory) to I420.
LIBYUV_API
-int RGB24ToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int RGB24ToI420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB big endian (rgb in memory) to I420.
LIBYUV_API
-int RAWToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int RAWToI420(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to I420.
LIBYUV_API
-int RGB565ToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int RGB565ToI420(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to I420.
LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB12 (R444 fourcc) little endian to I420.
LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_y,
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
#ifdef HAVE_JPEG
// src_width/height provided by capture.
// dst_width/height for clipping determine final size.
LIBYUV_API
-int MJPGToI420(const uint8* sample,
+int MJPGToI420(const uint8_t* sample,
size_t sample_size,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int src_width,
int src_height,
int dst_width,
int dst_height);
+// JPEG to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height);
+
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height);
+int MJPGSize(const uint8_t* sample,
+ size_t sample_size,
+ int* width,
+ int* height);
#endif
// Convert camera sample to I420 with cropping, rotation and vertical flip.
@@ -338,16 +476,16 @@
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
-int ConvertToI420(const uint8* src_frame,
- size_t src_size,
- uint8* dst_y,
+int ConvertToI420(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int crop_x,
int crop_y,
@@ -356,7 +494,7 @@
int crop_width,
int crop_height,
enum RotationMode rotation,
- uint32 format);
+ uint32_t fourcc);
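+
+// A hedged usage sketch: convert a 640x480 packed YUY2 sample to I420,
+// no crop, no rotation (buffers are hypothetical; FOURCC_YUY2 and
+// kRotate0 come from video_common.h and rotate.h):
+//
+//   int ret = ConvertToI420(sample, sample_size,
+//                           dst_y, 640,  // Y stride = width
+//                           dst_u, 320,  // U stride = width / 2
+//                           dst_v, 320,  // V stride = width / 2
+//                           0, 0,        // crop_x, crop_y
+//                           640, 480,    // src_width, src_height
+//                           640, 480,    // crop_width, crop_height
+//                           kRotate0, FOURCC_YUY2);
+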
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/convert_argb.h b/files/include/libyuv/convert_argb.h
index 6bdfd95..e8ed1f5 100644
--- a/files/include/libyuv/convert_argb.h
+++ b/files/include/libyuv/convert_argb.h
@@ -30,102 +30,167 @@
// Copy ARGB to ARGB.
LIBYUV_API
-int ARGBCopy(const uint8* src_argb,
+int ARGBCopy(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I420 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y,
+int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Duplicate prototype for function in convert_from.h for remoting.
LIBYUV_API
-int I420ToABGR(const uint8* src_y,
+int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I422 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8* src_y,
+int I422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8* src_y,
+int I444ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J444 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8* src_y,
+int J444ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I444 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8* src_y,
+int I444ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert I420 with Alpha to preattenuated ARGB.
LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y,
+int I420AlphaToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- const uint8* src_a,
+ const uint8_t* src_a,
int src_stride_a,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
@@ -133,15 +198,15 @@
// Convert I420 with Alpha to preattenuated ABGR.
LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y,
+int I420AlphaToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- const uint8* src_a,
+ const uint8_t* src_a,
int src_stride_a,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height,
@@ -149,18 +214,18 @@
// Convert I400 (grey) to ARGB. Reverse of ARGBToI400.
LIBYUV_API
-int I400ToARGB(const uint8* src_y,
+int I400ToARGB(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J400 (jpeg grey) to ARGB.
LIBYUV_API
-int J400ToARGB(const uint8* src_y,
+int J400ToARGB(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
@@ -170,202 +235,322 @@
// Convert NV12 to ARGB.
LIBYUV_API
-int NV12ToARGB(const uint8* src_y,
+int NV12ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV21 to ARGB.
LIBYUV_API
-int NV21ToARGB(const uint8* src_y,
+int NV21ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_vu,
+ const uint8_t* src_vu,
int src_stride_vu,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert NV12 to ABGR.
LIBYUV_API
-int NV12ToABGR(const uint8* src_y,
+int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert NV21 to ABGR.
LIBYUV_API
-int NV21ToABGR(const uint8* src_y,
+int NV21ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
- int src_stride_uv,
- uint8* dst_abgr,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
// Convert M420 to ARGB.
LIBYUV_API
-int M420ToARGB(const uint8* src_m420,
+int M420ToARGB(const uint8_t* src_m420,
int src_stride_m420,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert YUY2 to ARGB.
LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2,
+int YUY2ToARGB(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert UYVY to ARGB.
LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy,
+int UYVYToARGB(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ARGB.
LIBYUV_API
-int J420ToARGB(const uint8* src_y,
+int J420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J422 to ARGB.
LIBYUV_API
-int J422ToARGB(const uint8* src_y,
+int J422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert J420 to ABGR.
LIBYUV_API
-int J420ToABGR(const uint8* src_y,
+int J420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert J422 to ABGR.
LIBYUV_API
-int J422ToABGR(const uint8* src_y,
+int J422ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H420 to ARGB.
LIBYUV_API
-int H420ToARGB(const uint8* src_y,
+int H420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H422 to ARGB.
LIBYUV_API
-int H422ToARGB(const uint8* src_y,
+int H422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H420 to ABGR.
LIBYUV_API
-int H420ToABGR(const uint8* src_y,
+int H420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert H422 to ABGR.
LIBYUV_API
-int H422ToABGR(const uint8* src_y,
+int H422ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
-int BGRAToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int BGRAToARGB(const uint8_t* src_bgra,
+ int src_stride_bgra,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// ABGR little endian (rgba in memory) to ARGB.
LIBYUV_API
-int ABGRToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int ABGRToARGB(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGBA little endian (abgr in memory) to ARGB.
LIBYUV_API
-int RGBAToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int RGBAToARGB(const uint8_t* src_rgba,
+ int src_stride_rgba,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
@@ -375,56 +560,88 @@
// RGB little endian (bgr in memory) to ARGB.
LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int RGB24ToARGB(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB big endian (rgb in memory) to ARGB.
LIBYUV_API
-int RAWToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int RAWToARGB(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB16 (RGBP fourcc) little endian to ARGB.
LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int RGB565ToARGB(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB15 (RGBO fourcc) little endian to ARGB.
LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// RGB12 (R444 fourcc) little endian to ARGB.
LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame,
- int src_stride_frame,
- uint8* dst_argb,
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height);
+
#ifdef HAVE_JPEG
// src_width/height provided by capture
// dst_width/height for clipping determine final size.
LIBYUV_API
-int MJPGToARGB(const uint8* sample,
+int MJPGToARGB(const uint8_t* sample,
size_t sample_size,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
@@ -434,34 +651,34 @@
// Convert Android420 to ARGB.
LIBYUV_API
-int Android420ToARGB(const uint8* src_y,
+int Android420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert Android420 to ABGR.
LIBYUV_API
-int Android420ToABGR(const uint8* src_y,
+int Android420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
+// "sample_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
// Normally this would be the same as dst_width, with recommended alignment
// to 16 bytes for better efficiency.
@@ -480,12 +697,12 @@
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
-int ConvertToARGB(const uint8* src_frame,
- size_t src_size,
- uint8* dst_argb,
+int ConvertToARGB(const uint8_t* sample,
+ size_t sample_size,
+ uint8_t* dst_argb,
int dst_stride_argb,
int crop_x,
int crop_y,
@@ -494,7 +711,7 @@
int crop_width,
int crop_height,
enum RotationMode rotation,
- uint32 format);
+ uint32_t fourcc);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/convert_from.h b/files/include/libyuv/convert_from.h
index 528d8dd..861418d 100644
--- a/files/include/libyuv/convert_from.h
+++ b/files/include/libyuv/convert_from.h
@@ -21,218 +21,257 @@
// See Also convert.h for conversions from formats to I420.
-// I420Copy in convert to I420ToI420.
-
-LIBYUV_API
-int I420ToI422(const uint8* src_y,
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+int I420ToI010(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint16_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint16_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
LIBYUV_API
-int I420ToI444(const uint8* src_y,
+int I420ToI422(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToI444(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
-int I400Copy(const uint8* src_y,
+int I400Copy(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height);
LIBYUV_API
-int I420ToNV12(const uint8* src_y,
+int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
-int I420ToNV21(const uint8* src_y,
+int I420ToNV21(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_vu,
+ uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height);
LIBYUV_API
-int I420ToYUY2(const uint8* src_y,
+int I420ToYUY2(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
int width,
int height);
LIBYUV_API
-int I420ToUYVY(const uint8* src_y,
+int I420ToUYVY(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
int width,
int height);
LIBYUV_API
-int I420ToARGB(const uint8* src_y,
+int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
LIBYUV_API
-int I420ToBGRA(const uint8* src_y,
+int I420ToBGRA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
int width,
int height);
LIBYUV_API
-int I420ToABGR(const uint8* src_y,
+int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
- int dst_stride_argb,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
int width,
int height);
LIBYUV_API
-int I420ToRGBA(const uint8* src_y,
+int I420ToRGBA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height);
LIBYUV_API
-int I420ToRGB24(const uint8* src_y,
+int I420ToRGB24(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
int width,
int height);
LIBYUV_API
-int I420ToRAW(const uint8* src_y,
+int I420ToRAW(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
int width,
int height);
LIBYUV_API
-int I420ToRGB565(const uint8* src_y,
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
int width,
int height);
LIBYUV_API
-int J420ToRGB565(const uint8* src_y,
+int J420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
int width,
int height);
LIBYUV_API
-int H420ToRGB565(const uint8* src_y,
+int H420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
int width,
int height);
LIBYUV_API
-int I422ToRGB565(const uint8* src_y,
+int I422ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
int width,
int height);
@@ -241,57 +280,83 @@
// The dither matrix is ordered with the first byte at the upper left.
LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y,
+int I420ToRGB565Dither(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
- const uint8* dither4x4,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const uint8_t* dither4x4,
int width,
int height);
LIBYUV_API
-int I420ToARGB1555(const uint8* src_y,
+int I420ToARGB1555(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_argb1555,
+ int dst_stride_argb1555,
int width,
int height);
LIBYUV_API
-int I420ToARGB4444(const uint8* src_y,
+int I420ToARGB4444(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
+ uint8_t* dst_argb4444,
+ int dst_stride_argb4444,
int width,
int height);
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
LIBYUV_API
-int ConvertFromI420(const uint8* y,
+int ConvertFromI420(const uint8_t* y,
int y_stride,
- const uint8* u,
+ const uint8_t* u,
int u_stride,
- const uint8* v,
+ const uint8_t* v,
int v_stride,
- uint8* dst_sample,
+ uint8_t* dst_sample,
int dst_sample_stride,
int width,
int height,
- uint32 format);
+ uint32_t fourcc);
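
// A minimal usage sketch (not part of this header): pack an I420 frame into a
// single NV12 buffer with ConvertFromI420, which returns 0 on success.
// FOURCC_NV12 is assumed to come from libyuv/video_common.h; the tightly
// packed strides, even dimensions and 3/2 size factor are illustrative
// assumptions.
#include <vector>
#include "libyuv/convert_from.h"
#include "libyuv/video_common.h"

bool I420ToNV12Buffer(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                      int width, int height, std::vector<uint8_t>* nv12) {
  nv12->resize(width * height * 3 / 2);  // Y plane + interleaved UV plane.
  return libyuv::ConvertFromI420(y, width, u, width / 2, v, width / 2,
                                 nv12->data(), 0,  // 0 = contiguous rows.
                                 width, height, libyuv::FOURCC_NV12) == 0;
}
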
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/convert_from_argb.h b/files/include/libyuv/convert_from_argb.h
index 50722d7..cbbef6f 100644
--- a/files/include/libyuv/convert_from_argb.h
+++ b/files/include/libyuv/convert_from_argb.h
@@ -21,63 +21,85 @@
// Copy ARGB to ARGB.
#define ARGBToARGB ARGBCopy
LIBYUV_API
-int ARGBCopy(const uint8* src_argb,
+int ARGBCopy(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert ARGB To BGRA.
LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb,
+int ARGBToBGRA(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_bgra,
+ uint8_t* dst_bgra,
int dst_stride_bgra,
int width,
int height);
// Convert ARGB To ABGR.
LIBYUV_API
-int ARGBToABGR(const uint8* src_argb,
+int ARGBToABGR(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height);
// Convert ARGB To RGBA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb,
+int ARGBToRGBA(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height);
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height);
+
// Convert ARGB To RGB24.
LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb,
+int ARGBToRGB24(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb24,
+ uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Convert ARGB To RAW.
LIBYUV_API
-int ARGBToRAW(const uint8* src_argb,
+int ARGBToRAW(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb,
- int dst_stride_rgb,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
int width,
int height);
// Convert ARGB To RGB565.
LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb,
+int ARGBToRGB565(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height);
@@ -86,173 +108,184 @@
// Values in dither matrix from 0 to 7 recommended.
// The order of the dither matrix is first byte is upper left.
// TODO(fbarchard): Consider pointer to 2d array for dither4x4.
-// const uint8(*dither)[4][4];
+// const uint8_t(*dither)[4][4];
LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb,
+int ARGBToRGB565Dither(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
- const uint8* dither4x4,
+ const uint8_t* dither4x4,
int width,
int height);
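
// A minimal sketch of the dither table described above (not part of this
// header): a halved 4x4 Bayer matrix, so every value stays in the recommended
// 0..7 range, stored row-major with the first byte at the upper left.
static const uint8_t kBayerDither4x4[16] = {
    0, 4, 1, 5,  // Row 0, upper-left value first.
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};
// Illustrative call; buffers and strides are assumed to be set up elsewhere:
// ARGBToRGB565Dither(src_argb, src_stride_argb, dst_rgb565, dst_stride_rgb565,
//                    kBayerDither4x4, width, height);
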
// Convert ARGB To ARGB1555.
LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb,
+int ARGBToARGB1555(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb1555,
+ uint8_t* dst_argb1555,
int dst_stride_argb1555,
int width,
int height);
// Convert ARGB To ARGB4444.
LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb,
+int ARGBToARGB4444(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb4444,
+ uint8_t* dst_argb4444,
int dst_stride_argb4444,
int width,
int height);
// Convert ARGB To I444.
LIBYUV_API
-int ARGBToI444(const uint8* src_argb,
+int ARGBToI444(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I422.
LIBYUV_API
-int ARGBToI422(const uint8* src_argb,
+int ARGBToI422(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB To I420. (also in convert.h)
LIBYUV_API
-int ARGBToI420(const uint8* src_argb,
+int ARGBToI420(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J420. (JPeg full range I420).
LIBYUV_API
-int ARGBToJ420(const uint8* src_argb,
+int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J422.
LIBYUV_API
-int ARGBToJ422(const uint8* src_argb,
+int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert ARGB to J400. (JPeg full range).
LIBYUV_API
-int ARGBToJ400(const uint8* src_argb,
+int ARGBToJ400(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
int width,
int height);
// Convert ARGB to I400.
LIBYUV_API
-int ARGBToI400(const uint8* src_argb,
+int ARGBToI400(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height);
// Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
LIBYUV_API
-int ARGBToG(const uint8* src_argb,
+int ARGBToG(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_g,
+ uint8_t* dst_g,
int dst_stride_g,
int width,
int height);
// Convert ARGB To NV12.
LIBYUV_API
-int ARGBToNV12(const uint8* src_argb,
+int ARGBToNV12(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert ARGB To NV21.
LIBYUV_API
-int ARGBToNV21(const uint8* src_argb,
+int ARGBToNV21(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_vu,
+ uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height);
+// Convert ABGR To NV12.
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
// Convert ARGB To YUY2.
LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb,
+int ARGBToYUY2(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yuy2,
+ uint8_t* dst_yuy2,
int dst_stride_yuy2,
int width,
int height);
// Convert ARGB To UYVY.
LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb,
+int ARGBToUYVY(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_uyvy,
+ uint8_t* dst_uyvy,
int dst_stride_uyvy,
int width,
int height);
diff --git a/files/include/libyuv/cpu_id.h b/files/include/libyuv/cpu_id.h
index bcddb32..b01cd25 100644
--- a/files/include/libyuv/cpu_id.h
+++ b/files/include/libyuv/cpu_id.h
@@ -36,44 +36,81 @@
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
static const int kCpuHasFMA3 = 0x1000;
-static const int kCpuHasAVX3 = 0x2000;
-static const int kCpuHasF16C = 0x4000;
-
-// 0x8000 reserved for future X86 flags.
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VBMI = 0x20000;
+static const int kCpuHasAVX512VBMI2 = 0x40000;
+static const int kCpuHasAVX512VBITALG = 0x80000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasDSPR2 = 0x20000;
-static const int kCpuHasMSA = 0x40000;
+static const int kCpuHasMIPS = 0x200000;
+static const int kCpuHasMSA = 0x400000;
+static const int kCpuHasMMI = 0x800000;
-// Internal function used to auto-init.
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
LIBYUV_API
int InitCpuFlags(void);
+// Detect CPU has SSE2 etc.
+// The test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+ int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+ int cpu_info = cpu_info_;
+#endif
+ return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
+}
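
// A minimal sketch (not part of this header) of the auto-init behavior above:
// the first TestCpuFlag call triggers InitCpuFlags; later calls just read the
// cached flags. The branch bodies are placeholders.
void PickOptimizedPath() {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasAVX2)) {
    // Dispatch to AVX2 code paths.
  } else if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    // Dispatch to SSSE3 code paths.
  } else {
    // Portable C fallback.
  }
}
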
+
// Internal function for parsing /proc/cpuinfo.
LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
-// Detect CPU has SSE2 etc.
-// Test_flag parameter should be one of kCpuHas constants above.
-// returns non-zero if instruction set is detected
-static __inline int TestCpuFlag(int test_flag) {
- LIBYUV_API extern int cpu_info_;
- return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
-}
-
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// MaskCpuFlags(-1) to enable all cpu specific optimizations.
// MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
LIBYUV_API
-void MaskCpuFlags(int enable_flags);
+int MaskCpuFlags(int enable_flags);
+
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non-sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+// again.
+// - enabling CPU features that are not supported by the CPU will result in
+// undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+ LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+ __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+ cpu_info_ = cpu_flags;
+#endif
+}
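
// A minimal sketch (not part of this header) of the sandbox flow described
// above; SendFlagsOverIpc is a hypothetical, application-specific transport
// (pipe, command line, etc.).
void SendFlagsOverIpc(int flags);  // Hypothetical IPC stub.

// Runs in the unsandboxed parent, where /proc/cpuinfo is readable.
void PublishCpuFlags() {
  SendFlagsOverIpc(libyuv::InitCpuFlags());
}

// Runs in the sandboxed child with the value received over IPC.
void ApplyCpuFlags(int flags) {
  libyuv::SetCpuFlags(flags);  // Bypasses detection entirely.
}
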
// Low level cpuid for X86. Returns zeros on other CPUs.
// eax is the info type that you want.
// ecx is typically the cpu number, and should normally be zero.
LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/macros_msa.h b/files/include/libyuv/macros_msa.h
index 61be352..29997ce 100644
--- a/files/include/libyuv/macros_msa.h
+++ b/files/include/libyuv/macros_msa.h
@@ -16,38 +16,38 @@
#include <stdint.h>
#if (__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
- uint32 val_m; \
- asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
+ uint32_t val_m; \
+ asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
- uint64 val_m = 0; \
- asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+ uint64_t val_m = 0; \
+ asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
- uint32 val0_m, val1_m; \
- uint64 val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64)(val1_m); /* NOLINT */ \
- val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64_t)(val1_m); /* NOLINT */ \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
+ val_m; \
})
#endif // (__mips == 64)
@@ -81,38 +81,38 @@
})
#endif // !(__mips == 64)
#else // !(__mips_isa_rev >= 6)
-#define LW(psrc) \
- ({ \
- uint8* psrc_lw_m = (uint8*)(psrc); /* NOLINT */ \
- uint32 val_m; \
- asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_lw_m] "m"(*psrc_lw_m)); \
- val_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
+ uint32_t val_m; \
+ asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_lw_m] "m"(*psrc_lw_m)); \
+ val_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
- uint64 val_m = 0; \
- asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
- : [val_m] "=r"(val_m) \
- : [psrc_ld_m] "m"(*psrc_ld_m)); \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+ uint64_t val_m = 0; \
+ asm volatile("uld %[val_m], %[psrc_ld_m] \n" \
+ : [val_m] "=r"(val_m) \
+ : [psrc_ld_m] "m"(*psrc_ld_m)); \
+ val_m; \
})
#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- uint8* psrc_ld_m = (uint8*)(psrc); /* NOLINT */ \
- uint32 val0_m, val1_m; \
- uint64 val_m = 0; \
- val0_m = LW(psrc_ld_m); \
- val1_m = LW(psrc_ld_m + 4); \
- val_m = (uint64)(val1_m); /* NOLINT */ \
- val_m = (uint64)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
- val_m = (uint64)(val_m | (uint64)val0_m); /* NOLINT */ \
- val_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+ uint32_t val0_m, val1_m; \
+ uint64_t val_m = 0; \
+ val0_m = LW(psrc_ld_m); \
+ val1_m = LW(psrc_ld_m + 4); \
+ val_m = (uint64_t)(val1_m); /* NOLINT */ \
+ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+ val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \
+ val_m; \
})
#endif // (__mips == 64)
@@ -138,7 +138,7 @@
// TODO(fbarchard): Consider removing __VAR_ARGS versions.
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
@@ -158,14 +158,14 @@
out0 = LD_B(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \
}
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_B2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
/* Description : Store two vectors with stride each having 16 'byte' sized
elements
diff --git a/files/include/libyuv/mjpeg_decoder.h b/files/include/libyuv/mjpeg_decoder.h
index 8a4f282..275f8d4 100644
--- a/files/include/libyuv/mjpeg_decoder.h
+++ b/files/include/libyuv/mjpeg_decoder.h
@@ -26,13 +26,13 @@
extern "C" {
#endif
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
#ifdef __cplusplus
} // extern "C"
#endif
-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
enum JpegSubsamplingType {
kJpegYuv420,
@@ -43,7 +43,7 @@
};
struct Buffer {
- const uint8* data;
+ const uint8_t* data;
int len;
};
@@ -65,7 +65,7 @@
class LIBYUV_API MJpegDecoder {
public:
typedef void (*CallbackFunction)(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows);
@@ -85,7 +85,7 @@
// If return value is LIBYUV_TRUE, then the values for all the following
// getters are populated.
// src_len is the size of the compressed mjpeg frame in bytes.
- LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+ LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
// Returns width of the last loaded frame in pixels.
int GetWidth();
@@ -138,7 +138,7 @@
// at least GetComponentSize(i). The pointers in planes are incremented
// to point to after the end of the written data.
// TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
- LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+ LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
// Decodes the entire image and passes the data via repeated calls to a
// callback function. Each call will get the data for a whole number of
@@ -162,7 +162,7 @@
LIBYUV_BOOL StartDecode();
LIBYUV_BOOL FinishDecode();
- void SetScanlinePointers(uint8** data);
+ void SetScanlinePointers(uint8_t** data);
LIBYUV_BOOL DecodeImcuRow();
int GetComponentScanlinePadding(int component);
@@ -181,11 +181,11 @@
// Temporaries used to point to scanline outputs.
int num_outbufs_; // Outermost size of all arrays below.
- uint8*** scanlines_;
+ uint8_t*** scanlines_;
int* scanlines_sizes_;
// Temporary buffer used for decoding when we can't decode directly to the
// output buffers. Large enough for just one iMCU row.
- uint8** databuf_;
+ uint8_t** databuf_;
int* databuf_strides_;
};
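
// A minimal decode sketch (not part of this header). The three-plane layout
// and caller-allocated buffers are assumptions; a real caller should inspect
// GetColorSpace() and GetNumComponents() before sizing the planes.
bool DecodeMjpegFrame(const uint8_t* jpeg, size_t jpeg_size,
                      uint8_t* y, uint8_t* u, uint8_t* v) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(jpeg, jpeg_size)) {
    return false;  // Not a parseable MJPEG frame.
  }
  uint8_t* planes[3] = {y, u, v};
  return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                 decoder.GetHeight()) != 0;
}
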
diff --git a/files/include/libyuv/planar_functions.h b/files/include/libyuv/planar_functions.h
index 040839c..f6f5b3e 100644
--- a/files/include/libyuv/planar_functions.h
+++ b/files/include/libyuv/planar_functions.h
@@ -22,705 +22,10 @@
extern "C" {
#endif
-// Copy a plane of data.
-LIBYUV_API
-void CopyPlane(const uint8* src_y,
- int src_stride_y,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-LIBYUV_API
-void CopyPlane_16(const uint16* src_y,
- int src_stride_y,
- uint16* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-// Set a plane of data to a 32 bit value.
-LIBYUV_API
-void SetPlane(uint8* dst_y,
- int dst_stride_y,
- int width,
- int height,
- uint32 value);
-
-// Split interleaved UV plane into separate U and V planes.
-LIBYUV_API
-void SplitUVPlane(const uint8* src_uv,
- int src_stride_uv,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Merge separate U and V planes into one interleaved UV plane.
-LIBYUV_API
-void MergeUVPlane(const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_uv,
- int dst_stride_uv,
- int width,
- int height);
-
-// Copy I400. Supports inverting.
-LIBYUV_API
-int I400ToI400(const uint8* src_y,
- int src_stride_y,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-#define J400ToJ400 I400ToI400
-
-// Copy I422 to I422.
-#define I422ToI422 I422Copy
-LIBYUV_API
-int I422Copy(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Copy I444 to I444.
-#define I444ToI444 I444Copy
-LIBYUV_API
-int I444Copy(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert YUY2 to I422.
-LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2,
- int src_stride_yuy2,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Convert UYVY to I422.
-LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy,
- int src_stride_uyvy,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2,
- int src_stride_yuy2,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_uv,
- int dst_stride_uv,
- int width,
- int height);
-
-LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy,
- int src_stride_uyvy,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_uv,
- int dst_stride_uv,
- int width,
- int height);
-
-LIBYUV_API
-int YUY2ToY(const uint8* src_yuy2,
- int src_stride_yuy2,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-// Convert I420 to I400. (calls CopyPlane ignoring u/v).
-LIBYUV_API
-int I420ToI400(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-// Alias
-#define J420ToJ400 I420ToI400
-#define I420ToI420Mirror I420Mirror
-
-// I420 mirror.
-LIBYUV_API
-int I420Mirror(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Alias
-#define I400ToI400Mirror I400Mirror
-
-// I400 mirror. A single plane is mirrored horizontally.
-// Pass negative height to achieve 180 degree rotation.
-LIBYUV_API
-int I400Mirror(const uint8* src_y,
- int src_stride_y,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-// Alias
-#define ARGBToARGBMirror ARGBMirror
-
-// ARGB mirror.
-LIBYUV_API
-int ARGBMirror(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert NV12 to RGB565.
-LIBYUV_API
-int NV12ToRGB565(const uint8* src_y,
- int src_stride_y,
- const uint8* src_uv,
- int src_stride_uv,
- uint8* dst_rgb565,
- int dst_stride_rgb565,
- int width,
- int height);
-
-// I422ToARGB is in convert_argb.h
-// Convert I422 to BGRA.
-LIBYUV_API
-int I422ToBGRA(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_bgra,
- int dst_stride_bgra,
- int width,
- int height);
-
-// Convert I422 to ABGR.
-LIBYUV_API
-int I422ToABGR(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_abgr,
- int dst_stride_abgr,
- int width,
- int height);
-
-// Convert I422 to RGBA.
-LIBYUV_API
-int I422ToRGBA(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_rgba,
- int dst_stride_rgba,
- int width,
- int height);
-
-// Alias
-#define RGB24ToRAW RAWToRGB24
-
-LIBYUV_API
-int RAWToRGB24(const uint8* src_raw,
- int src_stride_raw,
- uint8* dst_rgb24,
- int dst_stride_rgb24,
- int width,
- int height);
-
-// Draw a rectangle into I420.
-LIBYUV_API
-int I420Rect(uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int x,
- int y,
- int width,
- int height,
- int value_y,
- int value_u,
- int value_v);
-
-// Draw a rectangle into ARGB.
-LIBYUV_API
-int ARGBRect(uint8* dst_argb,
- int dst_stride_argb,
- int x,
- int y,
- int width,
- int height,
- uint32 value);
-
-// Convert ARGB to gray scale ARGB.
-LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Make a rectangle of ARGB gray scale.
-LIBYUV_API
-int ARGBGray(uint8* dst_argb,
- int dst_stride_argb,
- int x,
- int y,
- int width,
- int height);
-
-// Make a rectangle of ARGB Sepia tone.
-LIBYUV_API
-int ARGBSepia(uint8* dst_argb,
- int dst_stride_argb,
- int x,
- int y,
- int width,
- int height);
-
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The next 4 coefficients apply to B, G, R, A and produce R of the output.
-// The last 4 coefficients apply to B, G, R, A and produce A of the output.
-LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- const int8* matrix_argb,
- int width,
- int height);
-
-// Deprecated. Use ARGBColorMatrix instead.
-// Apply a matrix rotation to each ARGB pixel.
-// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
-// The first 4 coefficients apply to B, G, R, A and produce B of the output.
-// The next 4 coefficients apply to B, G, R, A and produce G of the output.
-// The last 4 coefficients apply to B, G, R, A and produce R of the output.
-LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb,
- int dst_stride_argb,
- const int8* matrix_rgb,
- int x,
- int y,
- int width,
- int height);
-
-// Apply a color table each ARGB pixel.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int ARGBColorTable(uint8* dst_argb,
- int dst_stride_argb,
- const uint8* table_argb,
- int x,
- int y,
- int width,
- int height);
-
-// Apply a color table each ARGB pixel but preserve destination alpha.
-// Table contains 256 ARGB values.
-LIBYUV_API
-int RGBColorTable(uint8* dst_argb,
- int dst_stride_argb,
- const uint8* table_argb,
- int x,
- int y,
- int width,
- int height);
-
-// Apply a luma/color table each ARGB pixel but preserve destination alpha.
-// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
-// RGB (YJ style) and C is an 8 bit color component (R, G or B).
-LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- const uint8* luma_rgb_table,
- int width,
- int height);
-
-// Apply a 3 term polynomial to ARGB values.
-// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
-// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
-// g squared, r squared and a squared. The 4rd row is coefficients for b to
-// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
-// result clamped to 0 to 255.
-// A polynomial approximation can be dirived using software such as 'R'.
-
-LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- const float* poly,
- int width,
- int height);
-
-// Convert plane of 16 bit shorts to half floats.
-// Source values are multiplied by scale before storing as half float.
-LIBYUV_API
-int HalfFloatPlane(const uint16* src_y,
- int src_stride_y,
- uint16* dst_y,
- int dst_stride_y,
- float scale,
- int width,
- int height);
-
-// Quantize a rectangle of ARGB. Alpha unaffected.
-// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
-// interval_size should be a value between 1 and 255.
-// interval_offset should be a value between 0 and 255.
-LIBYUV_API
-int ARGBQuantize(uint8* dst_argb,
- int dst_stride_argb,
- int scale,
- int interval_size,
- int interval_offset,
- int x,
- int y,
- int width,
- int height);
-
-// Copy ARGB to ARGB.
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Copy Alpha channel of ARGB to alpha of ARGB.
-LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Extract the alpha channel from ARGB.
-LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_a,
- int dst_stride_a,
- int width,
- int height);
-
-// Copy Y channel to Alpha of ARGB.
-LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y,
- int src_stride_y,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-typedef void (*ARGBBlendRow)(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
- int width);
-
-// Get function to Alpha Blend ARGB pixels and store to destination.
-LIBYUV_API
-ARGBBlendRow GetARGBBlend();
-
-// Alpha Blend ARGB images and store to destination.
-// Source is pre-multiplied by alpha using ARGBAttenuate.
-// Alpha of destination is set to 255.
-LIBYUV_API
-int ARGBBlend(const uint8* src_argb0,
- int src_stride_argb0,
- const uint8* src_argb1,
- int src_stride_argb1,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Alpha Blend plane and store to destination.
-// Source is not pre-multiplied by alpha.
-LIBYUV_API
-int BlendPlane(const uint8* src_y0,
- int src_stride_y0,
- const uint8* src_y1,
- int src_stride_y1,
- const uint8* alpha,
- int alpha_stride,
- uint8* dst_y,
- int dst_stride_y,
- int width,
- int height);
-
-// Alpha Blend YUV images and store to destination.
-// Source is not pre-multiplied by alpha.
-// Alpha is full width x height and subsampled to half size to apply to UV.
-LIBYUV_API
-int I420Blend(const uint8* src_y0,
- int src_stride_y0,
- const uint8* src_u0,
- int src_stride_u0,
- const uint8* src_v0,
- int src_stride_v0,
- const uint8* src_y1,
- int src_stride_y1,
- const uint8* src_u1,
- int src_stride_u1,
- const uint8* src_v1,
- int src_stride_v1,
- const uint8* alpha,
- int alpha_stride,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height);
-
-// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
-LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0,
- int src_stride_argb0,
- const uint8* src_argb1,
- int src_stride_argb1,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Add ARGB image with ARGB image. Saturates to 255.
-LIBYUV_API
-int ARGBAdd(const uint8* src_argb0,
- int src_stride_argb0,
- const uint8* src_argb1,
- int src_stride_argb1,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
-LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0,
- int src_stride_argb0,
- const uint8* src_argb1,
- int src_stride_argb1,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert I422 to YUY2.
-LIBYUV_API
-int I422ToYUY2(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
- int width,
- int height);
-
-// Convert I422 to UYVY.
-LIBYUV_API
-int I422ToUYVY(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_frame,
- int dst_stride_frame,
- int width,
- int height);
-
-// Convert unattentuated ARGB to preattenuated ARGB.
-LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Convert preattentuated ARGB to unattenuated ARGB.
-LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height);
-
-// Internal function - do not call directly.
-// Computes table of cumulative sum for image where the value is the sum
-// of all values above and to the left of the entry. Used by ARGBBlur.
-LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb,
- int src_stride_argb,
- int32* dst_cumsum,
- int dst_stride32_cumsum,
- int width,
- int height);
-
-// Blur ARGB image.
-// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
-// 16 byte boundary.
-// dst_stride32_cumsum is number of ints in a row (width * 4).
-// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
-// Blur is optimized for radius of 5 (11x11) or less.
-LIBYUV_API
-int ARGBBlur(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int32* dst_cumsum,
- int dst_stride32_cumsum,
- int width,
- int height,
- int radius);
-
-// Multiply ARGB image by ARGB value.
-LIBYUV_API
-int ARGBShade(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- uint32 value);
-
-// Interpolate between two images using specified amount of interpolation
-// (0 to 255) and store to destination.
-// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
-// and 255 means 1% src0 and 99% src1.
-LIBYUV_API
-int InterpolatePlane(const uint8* src0,
- int src_stride0,
- const uint8* src1,
- int src_stride1,
- uint8* dst,
- int dst_stride,
- int width,
- int height,
- int interpolation);
-
-// Interpolate between two ARGB images using specified amount of interpolation
-// Internally calls InterpolatePlane with width * 4 (bpp).
-LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0,
- int src_stride_argb0,
- const uint8* src_argb1,
- int src_stride_argb1,
- uint8* dst_argb,
- int dst_stride_argb,
- int width,
- int height,
- int interpolation);
-
-// Interpolate between two YUV images using specified amount of interpolation
-// Internally calls InterpolatePlane on each plane where the U and V planes
-// are half width and half height.
-LIBYUV_API
-int I420Interpolate(const uint8* src0_y,
- int src0_stride_y,
- const uint8* src0_u,
- int src0_stride_u,
- const uint8* src0_v,
- int src0_stride_v,
- const uint8* src1_y,
- int src1_stride_y,
- const uint8* src1_u,
- int src1_stride_u,
- const uint8* src1_v,
- int src1_stride_v,
- uint8* dst_y,
- int dst_stride_y,
- uint8* dst_u,
- int dst_stride_u,
- uint8* dst_v,
- int dst_stride_v,
- int width,
- int height,
- int interpolation);
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
- (defined(__i386__) && !defined(__SSE2__))
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
@@ -735,55 +40,814 @@
#define HAS_ARGBAFFINEROW_SSE2
#endif
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int scale, // 16384 for 10 bits
+ int width,
+ int height);
+
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int scale, // 1024 for 10 bits
+ int width,
+ int height);
+
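
// A minimal sketch (not part of this header): squeeze a 10-bit Y plane down
// to 8 bits using the scale called out above (16384 for 10-bit input).
// Tightly packed rows are an illustrative assumption.
void Squash10BitLuma(const uint16_t* src_y10, uint8_t* dst_y8,
                     int width, int height) {
  libyuv::Convert16To8Plane(src_y10, width,  // Strides assume packed rows.
                            dst_y8, width,
                            16384, width, height);
}
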
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height,
+ uint32_t value);
+
+// Split interleaved UV plane into separate U and V planes.
+LIBYUV_API
+void SplitUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Merge separate U and V planes into one interleaved UV plane.
+LIBYUV_API
+void MergeUVPlane(const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height);
+
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int width,
+ int height);
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+#define J400ToJ400 I400ToI400
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToNV12(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int UYVYToNV12(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define J420ToJ400 I420ToI400
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ int width,
+ int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_bgra,
+ int dst_stride_bgra,
+ int width,
+ int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgba,
+ int dst_stride_rgba,
+ int width,
+ int height);
+
+// Alias
+#define RGB24ToRAW RAWToRGB24
+
+LIBYUV_API
+int RAWToRGB24(const uint8_t* src_raw,
+ int src_stride_raw,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int x,
+ int y,
+ int width,
+ int height,
+ int value_y,
+ int value_u,
+ int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height,
+ uint32_t value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const int8_t* matrix_argb,
+ int width,
+ int height);
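
// A minimal sketch of the coefficient layout above (not part of this header):
// a Rec.601-style grayscale matrix. Coefficients are int8_t where 64 stands
// for 1.0 (the -128..127 range maps to -2..2); the weights are illustrative.
static const int8_t kGrayMatrixARGB[16] = {
    7, 38, 19, 0,  // B out = 0.11*B + 0.59*G + 0.30*R
    7, 38, 19, 0,  // G out, same luma weights.
    7, 38, 19, 0,  // R out, same luma weights.
    0, 0,  0, 64,  // A out = A, unchanged.
};
// ARGBColorMatrix(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
//                 kGrayMatrixARGB, width, height);
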
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const int8_t* matrix_rgb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* table_argb,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const uint8_t* luma,
+ int width,
+ int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const float* poly,
+ int width,
+ int height);
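
// A minimal sketch of the 4x4 layout above (not part of this header): the
// identity polynomial, i.e. output = input for every channel.
static const float kIdentityPoly[16] = {
    0.f, 0.f, 0.f, 0.f,  // Row 0: constants for b, g, r, a.
    1.f, 1.f, 1.f, 1.f,  // Row 1: linear coefficients.
    0.f, 0.f, 0.f, 0.f,  // Row 2: squared coefficients.
    0.f, 0.f, 0.f, 0.f,  // Row 3: cubic coefficients.
};
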
+
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ float scale,
+ int width,
+ int height);
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8_t* dst_argb,
+ int dst_stride_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int dst_x,
+ int dst_y,
+ int width,
+ int height);
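
// A minimal posterize sketch (not part of this header), assuming the common
// quantize mapping ((v * scale) >> 16) * interval_size + interval_offset:
// 8 levels per channel, with the offset recentering each bucket.
void Posterize8Levels(uint8_t* argb, int stride, int width, int height) {
  libyuv::ARGBQuantize(argb, stride,
                       65536 / 32,  // scale: floor(v / 32).
                       32,          // interval_size: buckets of width 32.
                       16,          // interval_offset: bucket midpoints.
                       0, 0, width, height);
}
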
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Copy Alpha channel of ARGB to alpha of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Extract the alpha channel from ARGB.
+LIBYUV_API
+int ARGBExtractAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ int width,
+ int height);
+
+// Copy Y channel to Alpha of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Source is pre-multiplied by alpha using ARGBAttenuate.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Alpha Blend plane and store to destination.
+// Source is not pre-multiplied by alpha.
+LIBYUV_API
+int BlendPlane(const uint8_t* src_y0,
+ int src_stride_y0,
+ const uint8_t* src_y1,
+ int src_stride_y1,
+ const uint8_t* alpha,
+ int alpha_stride,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int width,
+ int height);
+
+// Alpha Blend YUV images and store to destination.
+// Source is not pre-multiplied by alpha.
+// Alpha is full width x height and subsampled to half size to apply to UV.
+LIBYUV_API
+int I420Blend(const uint8_t* src_y0,
+ int src_stride_y0,
+ const uint8_t* src_u0,
+ int src_stride_u0,
+ const uint8_t* src_v0,
+ int src_stride_v0,
+ const uint8_t* src_y1,
+ int src_stride_y1,
+ const uint8_t* src_u1,
+ int src_stride_u1,
+ const uint8_t* src_v1,
+ int src_stride_v1,
+ const uint8_t* alpha,
+ int alpha_stride,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_yuy2,
+ int dst_stride_yuy2,
+ int width,
+ int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_uyvy,
+ int dst_stride_uyvy,
+ int width,
+ int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+ int src_stride_argb,
+ int32_t* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height);
+
+// Blur ARGB image.
+// dst_cumsum is a table of width * (height + 1) * 16 bytes, aligned to a
+// 16 byte boundary.
+// dst_stride32_cumsum is the number of ints in a row (width * 4).
+// radius is the number of pixels around the center, e.g. 1 = 3x3, 2 = 5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int32_t* dst_cumsum,
+ int dst_stride32_cumsum,
+ int width,
+ int height,
+ int radius);
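
// A minimal sketch (not part of this header) wiring up the scratch buffer
// described above: 4 int32 per pixel plus one extra row. Plain heap
// allocation (rather than explicitly 16-byte-aligned memory) is an
// assumption here.
#include <vector>

void BlurRadius5(const uint8_t* src, int src_stride,
                 uint8_t* dst, int dst_stride, int width, int height) {
  std::vector<int32_t> cumsum(static_cast<size_t>(width) * (height + 1) * 4);
  libyuv::ARGBBlur(src, src_stride, dst, dst_stride,
                   cumsum.data(), width * 4,  // Ints per cumsum row.
                   width, height, 5);         // Radius 5 => 11x11 window.
}
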
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ uint32_t value);
+
+// Interpolate between two images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
+// and 255 means 1% src0 and 99% src1.
+LIBYUV_API
+int InterpolatePlane(const uint8_t* src0,
+ int src_stride0,
+ const uint8_t* src1,
+ int src_stride1,
+ uint8_t* dst,
+ int dst_stride,
+ int width,
+ int height,
+ int interpolation);
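
// A minimal sketch (not part of this header): a 50/50 cross-fade of two
// equally sized planes; 128 is the midpoint of the 8-bit fraction.
void CrossFadePlanes(const uint8_t* a, int stride_a,
                     const uint8_t* b, int stride_b,
                     uint8_t* dst, int dst_stride, int width, int height) {
  libyuv::InterpolatePlane(a, stride_a, b, stride_b, dst, dst_stride,
                           width, height, 128);  // 0 = all a, 255 ~= all b.
}
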
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// Internally calls InterpolatePlane with width * 4 (bpp).
+LIBYUV_API
+int ARGBInterpolate(const uint8_t* src_argb0,
+ int src_stride_argb0,
+ const uint8_t* src_argb1,
+ int src_stride_argb1,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height,
+ int interpolation);
+
+// Interpolate between two YUV images using specified amount of interpolation
+// Internally calls InterpolatePlane on each plane where the U and V planes
+// are half width and half height.
+LIBYUV_API
+int I420Interpolate(const uint8_t* src0_y,
+ int src0_stride_y,
+ const uint8_t* src0_u,
+ int src0_stride_u,
+ const uint8_t* src0_v,
+ int src0_stride_v,
+ const uint8_t* src1_y,
+ int src1_stride_y,
+ const uint8_t* src1_u,
+ int src1_stride_u,
+ const uint8_t* src1_v,
+ int src1_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ int interpolation);
+
// Row function for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb,
+void ARGBAffineRow_C(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb,
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width);
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
// shuffler is 16 bytes and must be aligned.
LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra,
+int ARGBShuffle(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- const uint8* shuffler,
+ const uint8_t* shuffler,
int width,
int height);
// Sobel ARGB effect with planar output.
LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb,
+int ARGBSobelToPlane(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height);
// Sobel ARGB effect.
LIBYUV_API
-int ARGBSobel(const uint8* src_argb,
+int ARGBSobel(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb,
+int ARGBSobelXY(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
diff --git a/files/include/libyuv/rotate.h b/files/include/libyuv/rotate.h
index b9f7154..c64e021 100644
--- a/files/include/libyuv/rotate.h
+++ b/files/include/libyuv/rotate.h
@@ -33,79 +33,97 @@
// Rotate I420 frame.
LIBYUV_API
-int I420Rotate(const uint8* src_y,
+int I420Rotate(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
- int src_width,
- int src_height,
+ int width,
+ int height,
+ enum RotationMode mode);
+
+// Rotate I444 frame.
+LIBYUV_API
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
enum RotationMode mode);
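
// A minimal sketch (not part of this header): rotate an I420 frame 90
// degrees. The destination strides follow the rotated geometry (destination
// width equals source height); tightly packed planes and even dimensions are
// illustrative assumptions. kRotate90 is from the RotationMode enum declared
// earlier in this header.
void RotateI420By90(const uint8_t* src_y, const uint8_t* src_u,
                    const uint8_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                    uint8_t* dst_v, int width, int height) {
  libyuv::I420Rotate(src_y, width, src_u, width / 2, src_v, width / 2,
                     dst_y, height, dst_u, height / 2, dst_v, height / 2,
                     width, height, libyuv::kRotate90);
}
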
// Rotate NV12 input and store in I420.
LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y,
+int NV12ToI420Rotate(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
- int src_width,
- int src_height,
+ int width,
+ int height,
enum RotationMode mode);
// Rotate a plane by 0, 90, 180, or 270.
LIBYUV_API
-int RotatePlane(const uint8* src,
+int RotatePlane(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
- int src_width,
- int src_height,
+ int width,
+ int height,
enum RotationMode mode);
// Rotate planes by 90, 180, 270. Deprecated.
LIBYUV_API
-void RotatePlane90(const uint8* src,
+void RotatePlane90(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
-void RotatePlane180(const uint8* src,
+void RotatePlane180(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
-void RotatePlane270(const uint8* src,
+void RotatePlane270(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
-void RotateUV90(const uint8* src,
+void RotateUV90(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height);
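As the note below explains, the RotateUV functions rotate and de-interleave
in one pass, which fits NV12's packed UV plane; a sketch (dst_a receives the
first interleaved channel, dst_b the second):

/* Rotate an NV12 UV plane 90 degrees while splitting U and V. */
RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u,
           dst_v, dst_stride_v, halfwidth, halfheight);

NV12ToI420Rotate above wraps this pattern and is the supported entry point.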
@@ -115,21 +133,21 @@
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
-void RotateUV180(const uint8* src,
+void RotateUV180(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height);
LIBYUV_API
-void RotateUV270(const uint8* src,
+void RotateUV270(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height);
@@ -139,19 +157,19 @@
// order will result in a rotation by ±90 degrees.
// Deprecated.
LIBYUV_API
-void TransposePlane(const uint8* src,
+void TransposePlane(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height);
LIBYUV_API
-void TransposeUV(const uint8* src,
+void TransposeUV(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height);
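A closing sketch for this header: rotation by 90 degrees is implemented as a
transpose with the source read bottom-up (negative stride), so TransposePlane
alone simply swaps axes; prefer RotatePlane, which wraps the deprecated calls:

/* dst becomes a height x width plane with axes swapped. */
TransposePlane(src, src_stride, dst, dst_stride, width, height);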
diff --git a/files/include/libyuv/rotate_argb.h b/files/include/libyuv/rotate_argb.h
index be0190c..2043294 100644
--- a/files/include/libyuv/rotate_argb.h
+++ b/files/include/libyuv/rotate_argb.h
@@ -21,9 +21,9 @@
// Rotate ARGB frame
LIBYUV_API
-int ARGBRotate(const uint8* src_argb,
+int ARGBRotate(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int src_width,
int src_height,
diff --git a/files/include/libyuv/rotate_row.h b/files/include/libyuv/rotate_row.h
index 2c51584..022293e 100644
--- a/files/include/libyuv/rotate_row.h
+++ b/files/include/libyuv/rotate_row.h
@@ -18,10 +18,14 @@
extern "C" {
#endif
-#if defined(__pnacl__) || defined(__CLR_VER) || \
- (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
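These guards also give downstream builds an escape hatch: predefining the
LIBYUV_DISABLE_* macros forces the portable C paths, useful under sanitizers
or when debugging a suspected SIMD kernel. A sketch:

/* Before including any libyuv header (or via -D compiler flags): */
#define LIBYUV_DISABLE_X86
#define LIBYUV_DISABLE_NEON
#include "libyuv/rotate_row.h"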
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
@@ -29,194 +33,185 @@
#endif
#endif
// The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
-// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__i386__) || \
- (defined(__x86_64__) && !defined(__native_client__)))
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSEWX8_SSSE3
#endif
-// The following are available for 64 bit GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
- defined(__x86_64__)
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
#define HAS_TRANSPOSEWX8_FAST_SSSE3
#define HAS_TRANSPOSEUVWX8_SSE2
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
#endif
-#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
- defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_DSPR2
-#define HAS_TRANSPOSEUVWX8_DSPR2
-#endif // defined(__mips__)
-
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_TRANSPOSEWX16_MSA
#define HAS_TRANSPOSEUVWX16_MSA
#endif
-void TransposeWxH_C(const uint8* src,
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_TRANSPOSEWX8_MMI
+#define HAS_TRANSPOSEUVWX8_MMI
+#endif
+
+void TransposeWxH_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height);
-void TransposeWx8_C(const uint8* src,
+void TransposeWx8_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx16_C(const uint8* src,
+void TransposeWx16_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_NEON(const uint8* src,
+void TransposeWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_SSSE3(const uint8* src,
+void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src,
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width);
-void TransposeWx16_MSA(const uint8* src,
+void TransposeWx16_MSA(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Any_NEON(const uint8* src,
+void TransposeWx8_Any_NEON(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Any_SSSE3(const uint8* src,
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src,
+void TransposeWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeWx8_Any_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
- int width);
-void TransposeWx16_Any_MSA(const uint8* src,
+void TransposeWx16_Any_MSA(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width);
-void TransposeUVWxH_C(const uint8* src,
+void TransposeUVWxH_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height);
-void TransposeUVWx8_C(const uint8* src,
+void TransposeUVWx8_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx16_C(const uint8* src,
+void TransposeUVWx16_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_SSE2(const uint8* src,
+void TransposeUVWx8_SSE2(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_NEON(const uint8* src,
+void TransposeUVWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst_a,
- int dst_stride_a,
- uint8* dst_b,
- int dst_stride_b,
- int width);
-void TransposeUVWx16_MSA(const uint8* src,
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_Any_SSE2(const uint8* src,
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_Any_NEON(const uint8* src,
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src,
- int src_stride,
- uint8* dst_a,
- int dst_stride_a,
- uint8* dst_b,
- int dst_stride_b,
- int width);
-void TransposeUVWx16_Any_MSA(const uint8* src,
+void TransposeUVWx8_Any_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width);
diff --git a/files/include/libyuv/row.h b/files/include/libyuv/row.h
index 3e5dd20..9bb4885 100644
--- a/files/include/libyuv/row.h
+++ b/files/include/libyuv/row.h
@@ -20,34 +20,20 @@
extern "C" {
#endif
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
-
-#define align_buffer_64(var, size) \
- uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */ \
- uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
-
-#define free_aligned_buffer_64(var) \
- free(var##_mem); \
- var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
- (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
-#if defined(__native_client__)
-#define LIBYUV_DISABLE_NEON
-#endif
// clang >= 3.5.0 required for Arm64.
#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -69,6 +55,15 @@
#endif // clang >= 3.4
#endif // __clang__
+// clang >= 6.0.0 required for AVX512.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+// clang in xcode follows a different versioning scheme.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if (__clang_major__ >= 7) && !defined(__APPLE__)
+#define CLANG_HAS_AVX512 1
+#endif // clang >= 7
+#endif // __clang__
+
// Visual C 2012 required for AVX2.
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
_MSC_VER >= 1700
@@ -85,7 +80,6 @@
#define HAS_ARGB4444TOARGBROW_SSE2
#define HAS_ARGBEXTRACTALPHAROW_SSE2
#define HAS_ARGBSETROW_X86
-#define HAS_ARGBSHUFFLEROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
@@ -120,8 +114,10 @@
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB24ROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB24ROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RAWTORGB24ROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
@@ -184,7 +180,6 @@
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(GCC_HAS_AVX2))
@@ -216,8 +211,10 @@
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB24ROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB24ROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@@ -246,7 +243,7 @@
// The following are available for AVX2 Visual C and clangcl 32 bit:
// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
#define HAS_ARGB1555TOARGBROW_AVX2
#define HAS_ARGB4444TOARGBROW_AVX2
@@ -264,6 +261,53 @@
#define HAS_I422TOARGBROW_SSSE3
#endif
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
+// I210 is the 10-bit format matching H010; "2" denotes 4:2:2 subsampling,
+// and the "I" prefix marks BT.601 where "H" marks BT.709.
+#define HAS_I210TOAR30ROW_SSSE3
+#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGERGBROW_SSSE3
+#define HAS_SPLITRGBROW_SSSE3
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTORAWROW_AVX2
+#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_I210TOAR30ROW_AVX2
+#define HAS_I210TOARGBROW_AVX2
+#define HAS_I422TOAR30ROW_AVX2
+#define HAS_I422TOUYVYROW_AVX2
+#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
+#endif
+
+// The following are available for AVX512 clang x86 platforms:
+// TODO(fbarchard): Port to GCC and Visual C
+// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+ (defined(CLANG_HAS_AVX512))
+#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#endif
+
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -288,8 +332,12 @@
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOUVROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
#define HAS_COPYROW_NEON
#define HAS_HALFFLOATROW_NEON
#define HAS_I400TOARGBROW_NEON
@@ -308,8 +356,11 @@
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB24ROW_NEON
#define HAS_NV12TORGB565ROW_NEON
#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RAWTORGB24ROW_NEON
#define HAS_RAWTOUVROW_NEON
@@ -323,7 +374,9 @@
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
+#define HAS_UVToVUROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
@@ -354,103 +407,163 @@
#define HAS_SOBELYROW_NEON
#endif
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-#define HAS_COPYROW_MIPS
-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOARGBROW_DSPR2
-#define HAS_INTERPOLATEROW_DSPR2
-#define HAS_MIRRORROW_DSPR2
-#define HAS_MIRRORUVROW_DSPR2
-#define HAS_SPLITUVROW_DSPR2
-#define HAS_RGB24TOARGBROW_DSPR2
-#define HAS_RAWTOARGBROW_DSPR2
-#define HAS_RGB565TOARGBROW_DSPR2
-#define HAS_ARGB1555TOARGBROW_DSPR2
-#define HAS_ARGB4444TOARGBROW_DSPR2
-#define HAS_I444TOARGBROW_DSPR2
-#define HAS_I422TOARGB4444ROW_DSPR2
-#define HAS_I422TOARGB1555ROW_DSPR2
-#define HAS_NV12TOARGBROW_DSPR2
-#define HAS_BGRATOUVROW_DSPR2
-#define HAS_BGRATOYROW_DSPR2
-#define HAS_ABGRTOUVROW_DSPR2
-#define HAS_ARGBTOYROW_DSPR2
-#define HAS_ABGRTOYROW_DSPR2
-#define HAS_RGBATOUVROW_DSPR2
-#define HAS_RGBATOYROW_DSPR2
-#define HAS_ARGBTOUVROW_DSPR2
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_FLOATDIVTOBYTEROW_NEON
+#define HAS_SCALESUMSAMPLES_NEON
#endif
-#endif
-
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
-#define HAS_ARGBMIRRORROW_MSA
-#define HAS_I422TOUYVYROW_MSA
-#define HAS_I422TOYUY2ROW_MSA
-#define HAS_MIRRORROW_MSA
-#define HAS_UYVYTOUVROW_MSA
-#define HAS_UYVYTOYROW_MSA
-#define HAS_YUY2TOUV422ROW_MSA
-#define HAS_YUY2TOUVROW_MSA
-#define HAS_YUY2TOYROW_MSA
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
#define HAS_ARGB4444TOARGBROW_MSA
-#define HAS_ARGBTOYROW_MSA
-#define HAS_ARGBTOUVROW_MSA
-#define HAS_I422TOARGBROW_MSA
-#define HAS_I422TORGBAROW_MSA
-#define HAS_I422ALPHATOARGBROW_MSA
-#define HAS_I422TORGB24ROW_MSA
-#define HAS_ARGBTORGB24ROW_MSA
-#define HAS_ARGBTORAWROW_MSA
-#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBBLENDROW_MSA
+#define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBQUANTIZEROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
#define HAS_ARGBTOARGB1555ROW_MSA
#define HAS_ARGBTOARGB4444ROW_MSA
-#define HAS_ARGBTOUV444ROW_MSA
-#define HAS_ARGBMULTIPLYROW_MSA
-#define HAS_ARGBADDROW_MSA
-#define HAS_ARGBSUBTRACTROW_MSA
-#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
#define HAS_ARGBTORGB565DITHERROW_MSA
-#define HAS_ARGBSHUFFLEROW_MSA
-#define HAS_ARGBSHADEROW_MSA
-#define HAS_ARGBGRAYROW_MSA
-#define HAS_ARGBSEPIAROW_MSA
-#define HAS_ARGB1555TOARGBROW_MSA
-#define HAS_RGB565TOARGBROW_MSA
-#define HAS_RGB24TOARGBROW_MSA
-#define HAS_RAWTOARGBROW_MSA
-#define HAS_ARGB1555TOYROW_MSA
-#define HAS_RGB565TOYROW_MSA
-#define HAS_RGB24TOYROW_MSA
-#define HAS_RAWTOYROW_MSA
-#define HAS_ARGB1555TOUVROW_MSA
-#define HAS_RGB565TOUVROW_MSA
-#define HAS_RGB24TOUVROW_MSA
-#define HAS_RAWTOUVROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_HALFFLOATROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_MERGEUVROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORUVROW_MSA
#define HAS_NV12TOARGBROW_MSA
#define HAS_NV12TORGB565ROW_MSA
#define HAS_NV21TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_SETROW_MSA
#define HAS_SOBELROW_MSA
#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXROW_MSA
#define HAS_SOBELXYROW_MSA
-#define HAS_ARGBTOYJROW_MSA
-#define HAS_BGRATOYROW_MSA
-#define HAS_ABGRTOYROW_MSA
-#define HAS_RGBATOYROW_MSA
-#define HAS_ARGBTOUVJROW_MSA
-#define HAS_BGRATOUVROW_MSA
-#define HAS_ABGRTOUVROW_MSA
-#define HAS_RGBATOUVROW_MSA
-#define HAS_I444TOARGBROW_MSA
-#define HAS_I400TOARGBROW_MSA
-#define HAS_J400TOARGBROW_MSA
-#define HAS_YUY2TOARGBROW_MSA
+#define HAS_SOBELYROW_MSA
+#define HAS_SPLITUVROW_MSA
#define HAS_UYVYTOARGBROW_MSA
-#define HAS_INTERPOLATEROW_MSA
-#define HAS_ARGBSETROW_MSA
-#define HAS_RAWTORGB24ROW_MSA
-#define HAS_MERGEUVROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
+#endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_ABGRTOUVROW_MMI
+#define HAS_ABGRTOYROW_MMI
+#define HAS_ARGB1555TOARGBROW_MMI
+#define HAS_ARGB1555TOUVROW_MMI
+#define HAS_ARGB1555TOYROW_MMI
+#define HAS_ARGB4444TOARGBROW_MMI
+#define HAS_ARGB4444TOUVROW_MMI
+#define HAS_ARGB4444TOYROW_MMI
+#define HAS_ARGBADDROW_MMI
+#define HAS_ARGBATTENUATEROW_MMI
+#define HAS_ARGBBLENDROW_MMI
+#define HAS_ARGBCOLORMATRIXROW_MMI
+#define HAS_ARGBCOPYALPHAROW_MMI
+#define HAS_ARGBCOPYYTOALPHAROW_MMI
+#define HAS_ARGBEXTRACTALPHAROW_MMI
+#define HAS_ARGBGRAYROW_MMI
+#define HAS_ARGBMIRRORROW_MMI
+#define HAS_ARGBMULTIPLYROW_MMI
+#define HAS_ARGBSEPIAROW_MMI
+#define HAS_ARGBSHADEROW_MMI
+#define HAS_ARGBSHUFFLEROW_MMI
+#define HAS_ARGBSUBTRACTROW_MMI
+#define HAS_ARGBTOARGB1555ROW_MMI
+#define HAS_ARGBTOARGB4444ROW_MMI
+#define HAS_ARGBTORAWROW_MMI
+#define HAS_ARGBTORGB24ROW_MMI
+#define HAS_ARGBTORGB565DITHERROW_MMI
+#define HAS_ARGBTORGB565ROW_MMI
+#define HAS_ARGBTOUV444ROW_MMI
+#define HAS_ARGBTOUVJROW_MMI
+#define HAS_ARGBTOUVROW_MMI
+#define HAS_ARGBTOYJROW_MMI
+#define HAS_ARGBTOYROW_MMI
+#define HAS_BGRATOUVROW_MMI
+#define HAS_BGRATOYROW_MMI
+#define HAS_BLENDPLANEROW_MMI
+#define HAS_COMPUTECUMULATIVESUMROW_MMI
+#define HAS_CUMULATIVESUMTOAVERAGEROW_MMI
+#define HAS_HALFFLOATROW_MMI
+#define HAS_I400TOARGBROW_MMI
+#define HAS_I422TOUYVYROW_MMI
+#define HAS_I422TOYUY2ROW_MMI
+#define HAS_INTERPOLATEROW_MMI
+#define HAS_J400TOARGBROW_MMI
+#define HAS_MERGERGBROW_MMI
+#define HAS_MERGEUVROW_MMI
+#define HAS_MIRRORROW_MMI
+#define HAS_MIRRORUVROW_MMI
+#define HAS_RAWTOARGBROW_MMI
+#define HAS_RAWTORGB24ROW_MMI
+#define HAS_RAWTOUVROW_MMI
+#define HAS_RAWTOYROW_MMI
+#define HAS_RGB24TOARGBROW_MMI
+#define HAS_RGB24TOUVROW_MMI
+#define HAS_RGB24TOYROW_MMI
+#define HAS_RGB565TOARGBROW_MMI
+#define HAS_RGB565TOUVROW_MMI
+#define HAS_RGB565TOYROW_MMI
+#define HAS_RGBATOUVROW_MMI
+#define HAS_RGBATOYROW_MMI
+#define HAS_SOBELROW_MMI
+#define HAS_SOBELTOPLANEROW_MMI
+#define HAS_SOBELXROW_MMI
+#define HAS_SOBELXYROW_MMI
+#define HAS_SOBELYROW_MMI
+#define HAS_SPLITRGBROW_MMI
+#define HAS_SPLITUVROW_MMI
+#define HAS_UYVYTOUVROW_MMI
+#define HAS_UYVYTOYROW_MMI
+#define HAS_YUY2TOUV422ROW_MMI
+#define HAS_YUY2TOUVROW_MMI
+#define HAS_YUY2TOYROW_MMI
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -459,18 +572,18 @@
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
-typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint16 uvec16[8];
-typedef __declspec(align(16)) uint32 uvec32[4];
-typedef __declspec(align(16)) uint8 uvec8[16];
-typedef __declspec(align(32)) int16 lvec16[16];
-typedef __declspec(align(32)) int32 lvec32[8];
-typedef __declspec(align(32)) int8 lvec8[32];
-typedef __declspec(align(32)) uint16 ulvec16[16];
-typedef __declspec(align(32)) uint32 ulvec32[8];
-typedef __declspec(align(32)) uint8 ulvec8[32];
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
#elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
// Caveat: GCC 4.2 through 4.7 have a known issue using vectors with const.
#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
@@ -478,32 +591,32 @@
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
-typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
-typedef uint32 __attribute__((vector_size(16))) uvec32;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
-typedef int16 __attribute__((vector_size(32))) lvec16;
-typedef int32 __attribute__((vector_size(32))) lvec32;
-typedef int8 __attribute__((vector_size(32))) lvec8;
-typedef uint16 __attribute__((vector_size(32))) ulvec16;
-typedef uint32 __attribute__((vector_size(32))) ulvec32;
-typedef uint8 __attribute__((vector_size(32))) ulvec8;
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
-typedef int16 vec16[8];
-typedef int32 vec32[4];
-typedef int8 vec8[16];
-typedef uint16 uvec16[8];
-typedef uint32 uvec32[4];
-typedef uint8 uvec8[16];
-typedef int16 lvec16[16];
-typedef int32 lvec32[8];
-typedef int8 lvec8[32];
-typedef uint16 ulvec16[16];
-typedef uint32 ulvec32[8];
-typedef uint8 ulvec8[32];
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
#endif
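A sketch of how these types are used: SIMD_ALIGNED wraps the declarator so
the storage meets the 16- or 32-byte alignment the kernels assume on whichever
compiler path was taken above (kMaskRow is a hypothetical constant, not from
this header):

static const uvec8 SIMD_ALIGNED(kMaskRow) = {255, 0, 255, 0, 255, 0, 255, 0,
                                             255, 0, 255, 0, 255, 0, 255, 0};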
#if defined(__aarch64__)
@@ -527,13 +640,13 @@
#else
// This struct is for Intel color conversion.
struct YuvConstants {
- int8 kUVToB[32];
- int8 kUVToG[32];
- int8 kUVToR[32];
- int16 kUVBiasB[16];
- int16 kUVBiasG[16];
- int16 kUVBiasR[16];
- int16 kYToRgb[16];
+ int8_t kUVToB[32];
+ int8_t kUVToG[32];
+ int8_t kUVToR[32];
+ int16_t kUVBiasB[16];
+ int16_t kUVBiasG[16];
+ int16_t kUVBiasR[16];
+ int16_t kYToRgb[16];
};
// Offsets into YuvConstants structure
@@ -556,6 +669,16 @@
extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg
extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709
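Row converters take one of these tables to pick the conversion matrix at run
time; a sketch using the C fallback (kYuvI601Constants is declared in this
same header):

/* BT.601 limited-range YUV to ARGB for one row of `width` pixels. */
I422ToARGBRow_C(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);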
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+#define align_buffer_64(var, size) \
+ uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
+ uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0
+
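These helpers pair up for scratch rows: align_buffer_64 declares both
var##_mem (the raw, unchecked malloc) and var rounded up to a 64-byte
boundary; free_aligned_buffer_64 releases the raw pointer. A sketch (kWidth
is a hypothetical size):

align_buffer_64(row, kWidth * 4); /* declares uint8_t* row, 64-byte aligned */
/* ... use row[0 .. kWidth * 4 - 1] as temporary storage ... */
free_aligned_buffer_64(row);      /* frees the underlying row_mem block */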
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP
#else
@@ -568,71 +691,6 @@
#else
#define LABELALIGN
#endif
-#if defined(__native_client__) && defined(__x86_64__)
-// r14 is used for MEMOP macros.
-#define NACL_R14 "r14",
-#define BUNDLELOCK ".bundle_lock\n"
-#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
-#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) #offset "(,%q" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
- #offset "(%q" #base ",%q" #index "," #scale ")"
-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
- " (%%r15,%%r14),%%" #reg "\n" BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
- " %%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
- " (%%r15,%%r14),%" #arg "\n" BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #opcode \
- " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
- BUNDLELOCK \
- "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" #op \
- " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" BUNDLEUNLOCK
-#else // defined(__native_client__) && defined(__x86_64__)
-#define NACL_R14
-#define BUNDLEALIGN
-#define MEMACCESS(base) "(%" #base ")"
-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
-#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) #offset "(,%" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
- #offset "(%" #base ",%" #index "," #scale ")"
-#define MEMMOVESTRING(s, d)
-#define MEMSTORESTRING(reg, d)
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
- #opcode " %%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
- #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 \
- ",%%" #reg2 "\n"
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
- #op " $" #sel ",%%" #reg "," #offset "(%" #base ",%" #index "," #scale ")\n"
-#endif // defined(__native_client__) && defined(__x86_64__)
-
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
-#endif
-#endif
// Intel Architecture Code Analyzer (IACA) markers. Insert IACA_START and
// IACA_END around the code to be measured, then run iaca -64 libyuv_unittest.
@@ -685,2473 +743,3291 @@
IACA_UD_BYTES \
}
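Per the comment above, a measurement sketch brackets the hot loop with the
markers so IACA analyzes exactly that instruction window:

IACA_START
for (i = 0; i < width; ++i) {
  /* ... row kernel under measurement ... */
}
IACA_END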
-void I444ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_NEON(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I444ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB4444Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB1555Row_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-
-void I422ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGBARow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422AlphaToARGBRow_MSA(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGB24Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGB565Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB4444Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB1555Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToRGB565Row_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV21ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void UYVYToARGBRow_MSA(const uint8* src_uyvy,
- uint8* dst_argb,
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_MSA(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToUV444Row_NEON(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
int width);
-void ARGBToUVRow_NEON(const uint8* src_argb,
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
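The suffix convention in these declarations (C, SSSE3, AVX2, NEON, MSA, MMI,
plus _Any_ wrappers that tolerate widths that are not a multiple of the SIMD
step) is resolved by callers at run time. A simplified dispatch sketch,
assuming TestCpuFlag and kCpuHasSSSE3 from libyuv's cpu_id.h:

void (*ARGBToYRow)(const uint8_t*, uint8_t*, int) = ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
  ARGBToYRow = ARGBToYRow_Any_SSSE3;   /* handles any width */
  if (IS_ALIGNED(width, 16)) {
    ARGBToYRow = ARGBToYRow_SSSE3;     /* full-SIMD fast path */
  }
}
#endif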
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUV444Row_MSA(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_MSA(const uint8* src_argb,
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_NEON(const uint8* src_argb,
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_MMI(const uint8_t* src_argb0,
+ int src_stride_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_NEON(const uint8* src_bgra,
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_NEON(const uint8* src_abgr,
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGBAToUVRow_NEON(const uint8* src_rgba,
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RAWToUVRow_NEON(const uint8* src_raw,
+void RAWToUVRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_MSA(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_MSA(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_MSA(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGBAToUVRow_MSA(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB24ToUVRow_MSA(const uint8* src_rgb24,
- int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RAWToUVRow_MSA(const uint8* src_raw,
- int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
-void BGRAToYRow_MSA(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_MSA(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_MSA(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_MSA(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
-void BGRAToUVRow_DSPR2(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToYRow_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToUVRow_DSPR2(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGBToYRow_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
-void ABGRToYRow_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToUVRow_DSPR2(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGBAToYRow_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
-void ARGBToUVRow_DSPR2(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555,
- uint8* dst_y,
- int width);
-void BGRAToYRow_Any_DSPR2(const uint8* src_bgra, uint8* dst_y, int width);
-void ARGBToYRow_Any_DSPR2(const uint8* src_argb, uint8* dst_y, int width);
-void ABGRToYRow_Any_DSPR2(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_DSPR2(const uint8* src_rgba, uint8* dst_y, int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444,
- uint8* dst_y,
- int width);
-void BGRAToYRow_Any_MSA(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_MSA(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_MSA(const uint8* src_rgba, uint8* dst_y, int width);
-void ARGBToYJRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_MSA(const uint8* src_argb, uint8* dst_y, int width);
-void RGB24ToYRow_Any_MSA(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_MSA(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_MSA(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_MSA(const uint8* src_argb1555, uint8* dst_y, int width);
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width);
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
-void ARGBToUVRow_AVX2(const uint8* src_argb,
+void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+
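For orientation among the declarations above: every *ToYRow variant computes the same BT.601 studio-swing luma; the suffix only selects the SIMD path (_C reference, _NEON, _MSA, _MMI, _SSSE3, plus the _Any_ wrappers that handle leftover pixels). A minimal C sketch of the per-pixel step, using a hypothetical RGBToY_Sketch helper that is not part of this header (the J variants such as ARGBToYJRow use full-range JPEG constants instead):

#include <stdint.h>

/* BT.601 studio-swing luma in 8.8 fixed point: Y = 0.2578R + 0.5039G +
 * 0.0977B + 16, with 0x1080 = (16 << 8) + 128 folding in the offset and
 * rounding. Output spans 16..235. */
static uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}
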
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb,
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb,
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUV444Row_Any_MSA(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_Any_MSA(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24,
- int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RAWToUVRow_Any_NEON(const uint8* src_raw,
- int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565,
- int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
- int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
- int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGBToUVJRow_Any_MSA(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_Any_MSA(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ABGRToUVRow_Any_MSA(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGBAToUVRow_Any_MSA(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGB24ToUVRow_Any_MSA(const uint8* src_rgb24,
- int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RAWToUVRow_Any_MSA(const uint8* src_raw,
- int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB565ToUVRow_Any_MSA(const uint8* src_rgb565,
- int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_Any_MSA(const uint8* src_argb1555,
- int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_Any_DSPR2(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RAWToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_Any_DSPR2(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void RGBAToUVRow_Any_DSPR2(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGBToUVRow_Any_DSPR2(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void ARGBToUVRow_C(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVRow_C(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUVJRow_C(const uint8* src_argb,
- int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void BGRAToUVRow_C(const uint8* src_bgra,
- int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+void BGRAToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ABGRToUVRow_C(const uint8* src_abgr,
- int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+void ABGRToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGBAToUVRow_C(const uint8* src_rgba,
- int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+void RGBAToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24,
- int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RAWToUVRow_C(const uint8* src_raw,
- int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+void RAWToUVRow_C(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565,
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
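
The UV rows above take a stride as their second argument because chroma is sampled per 2x2 block: each output U/V value averages two pixels from the current row and two from the next row (src + stride). A hedged C sketch of that shape for ARGB input, under a hypothetical ARGBToUVRow_Sketch name with the usual BT.601 chroma constants (width assumed even; the shipped code also handles the odd tail):

#include <stdint.h>

static void ARGBToUVRow_Sketch(const uint8_t* src_argb, int src_stride_argb,
                               uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* row1 = src_argb + src_stride_argb; /* second row of block */
  int x;
  for (x = 0; x < width; x += 2) {
    /* Average the 2x2 block; ARGB is stored B,G,R,A in memory. */
    int b = (src_argb[0] + src_argb[4] + row1[0] + row1[4] + 2) >> 2;
    int g = (src_argb[1] + src_argb[5] + row1[1] + row1[5] + 2) >> 2;
    int r = (src_argb[2] + src_argb[6] + row1[2] + row1[6] + 2) >> 2;
    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_argb += 8;
    row1 += 8;
  }
}
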
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void ARGBToUV444Row_C(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
-void MirrorRow_MSA(const uint8* src, uint8* dst, int width);
-void MirrorRow_C(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void MirrorUVRow_SSSE3(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void MirrorUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MirrorUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_MSA(const uint8* src, uint8* dst, int width);
-
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void SplitUVRow_AVX2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void MirrorUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void SplitUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void SplitUVRow_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void SplitUVRow_Any_SSE2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void SplitUVRow_Any_AVX2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void SplitUVRow_Any_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
- int width);
+void MirrorUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
-void MergeUVRow_C(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void SplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_SSE2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_AVX2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_NEON(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_MSA(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_Any_NEON(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void MergeUVRow_Any_MSA(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void SplitUVRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
-void CopyRow_C(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+void MergeUVRow_C(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_MSA(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width);
+void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeUVRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeUVRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeUVRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
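
SplitUVRow and MergeUVRow are the NV12/NV21-style (de)interleave primitives behind many of the conversions in this drop; every SIMD variant implements the same trivial loop. A plain-C sketch (hypothetical _Sketch names, not part of the header):

#include <stdint.h>

static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];     /* even bytes -> U plane */
    dst_v[x] = src_uv[2 * x + 1]; /* odd bytes  -> V plane */
  }
}

static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x] = src_u[x];
    dst_uv[2 * x + 1] = src_v[x];
  }
}
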
-void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+void SplitRGBRow_C(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
+void SplitRGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width);
-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+void MergeRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void MergeRGBRow_Any_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+void MergeRGBRow_Any_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width);
+
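SplitRGBRow/MergeRGBRow extend the same idea to packed 3-byte RGB. A sketch of the deinterleave half, assuming the channel order implied by the parameter names (hypothetical _Sketch name):

#include <stdint.h>

static void SplitRGBRow_Sketch(const uint8_t* src_rgb, uint8_t* dst_r,
                               uint8_t* dst_g, uint8_t* dst_b, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[0];
    dst_g[x] = src_rgb[1];
    dst_b[x] = src_rgb[2];
    src_rgb += 3; /* packed 3 bytes per pixel */
  }
}
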
+void MergeUVRow_16_C(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale, /* 64 for 10 bit */
+ int width);
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale,
+ int width);
+
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void MultiplyRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+
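The "64 for 10 bit" comment is the key to these 16-bit rows: multiplying 10-bit samples (0..1023) by 64 shifts them into the top of the 16-bit range (0..65472), which is what P010-style MSB-aligned formats expect. A hedged sketch of that contract (hypothetical _Sketch name):

#include <stdint.h>

static void MergeUVRow_16_Sketch(const uint16_t* src_u, const uint16_t* src_v,
                                 uint16_t* dst_uv, int scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = (uint16_t)(src_u[x] * scale); /* e.g. scale = 64 for 10 bit */
    dst_uv[1] = (uint16_t)(src_v[x] * scale);
    dst_uv += 2;
  }
}

MultiplyRow_16 applies the same per-sample scale to a Y plane.
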
+void Convert8To16Row_C(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width);
+void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int scale,
+ int width);
+
+void Convert16To8Row_C(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width);
+void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
+ uint8_t* dst_ptr,
+ int scale,
+ int width);
+
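Convert8To16Row/Convert16To8Row move between 8-bit planes and the wider formats. One plausible reading of the scale convention, consistent with the 16-bit rows above but an assumption on my part rather than something this header states: the product is taken in 16.16 fixed point, so scale = 1024 widens to 10 bits and scale = 16384 narrows 10-bit input back to 8 bits.

#include <stdint.h>

static uint8_t Clamp255_Sketch(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

static void Convert8To16Row_Sketch(const uint8_t* src_y, uint16_t* dst_y,
                                   int scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    /* Byte replication (x * 0x0101) before the 16.16 scale maps 0..255
     * onto the full widened range, e.g. 255 -> 1023 for scale = 1024. */
    dst_y[x] = (uint16_t)((src_y[x] * 0x0101 * scale) >> 16);
  }
}

static void Convert16To8Row_Sketch(const uint16_t* src_y, uint8_t* dst_y,
                                   int scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_y[x] = Clamp255_Sketch((src_y[x] * scale) >> 16);
  }
}
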
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_a,
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_a,
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb,
- uint8* dst_a,
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y,
- uint8* dst_argb,
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
- uint8* dst_argb,
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
+void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
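
The alpha-plumbing rows above are simple byte moves: ARGBExtractAlphaRow pulls the A byte of each ARGB pixel out into a plane, ARGBCopyYToAlphaRow writes a plane into the A channel, and ARGBCopyAlphaRow copies A between two ARGB buffers. In plain C (hypothetical _Sketch names; ARGB is stored B,G,R,A in memory):

#include <stdint.h>

static void ARGBExtractAlphaRow_Sketch(const uint8_t* src_argb,
                                       uint8_t* dst_a, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3]; /* A is byte 3 of each pixel */
  }
}

static void ARGBCopyYToAlphaRow_Sketch(const uint8_t* src_y,
                                       uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[4 * x + 3] = src_y[x]; /* B,G,R bytes left untouched */
  }
}
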
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_MSA(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
// ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
- int width);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_MSA(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
- int width);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
int width);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
int width);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
int width);
-void ARGBShuffleRow_Any_MSA(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
+ int width);
+void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint8_t* param,
int width);
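
The shuffler argument above is a pshufb-style byte-index table (16 bytes for the SSSE3/AVX2 paths); a scalar fallback only needs the first four indices, applied per 4-byte pixel. For example, a table beginning {3, 2, 1, 0, ...} reverses each pixel's bytes, which is the BGRAToARGB case the comment mentions. A sketch (hypothetical _Sketch name):

#include <stdint.h>

static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b0 = src_argb[shuffler[0]];
    uint8_t b1 = src_argb[shuffler[1]];
    uint8_t b2 = src_argb[shuffler[2]];
    uint8_t b3 = src_argb[shuffler[3]];
    dst_argb[0] = b0; /* write after all reads so in-place shuffles work */
    dst_argb[1] = b1;
    dst_argb[2] = b2;
    dst_argb[3] = b3;
    src_argb += 4;
    dst_argb += 4;
  }
}
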
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
- uint8* dst_argb,
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width);
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
-void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
- uint8* dst_argb,
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width);
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width);
-void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
-void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24,
- uint8* dst_argb,
- int width);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width);
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width);
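
The 16-bit-to-ARGB C rows above all widen channels by bit replication so that full scale maps to 255 exactly. A sketch for RGB565 (hypothetical _Sketch name):

#include <stdint.h>

static void RGB565ToARGBRow_Sketch(const uint8_t* src_rgb565,
                                   uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
    uint8_t b = (uint8_t)(p & 0x1f);
    uint8_t g = (uint8_t)((p >> 5) & 0x3f);
    uint8_t r = (uint8_t)(p >> 11);
    dst_argb[0] = (uint8_t)((b << 3) | (b >> 2)); /* 5 -> 8 bits */
    dst_argb[1] = (uint8_t)((g << 2) | (g >> 4)); /* 6 -> 8 bits */
    dst_argb[2] = (uint8_t)((r << 3) | (r >> 2));
    dst_argb[3] = 255; /* opaque */
    src_rgb565 += 2;
    dst_argb += 4;
  }
}
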
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width);
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width);
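
AR30 is new in this drop: a 2:10:10:10 little-endian word with B in the low bits and 2-bit alpha on top. A hedged sketch of the ARGB-to-AR30 packing, widening 8-bit channels by bit replication (hypothetical _Sketch name; the layout is my reading of the AR30/AB30 naming):

#include <stdint.h>
#include <string.h>

static void ARGBToAR30Row_Sketch(const uint8_t* src_argb, uint8_t* dst_ar30,
                                 int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t b = ((uint32_t)src_argb[0] << 2) | (src_argb[0] >> 6);
    uint32_t g = ((uint32_t)src_argb[1] << 2) | (src_argb[1] >> 6);
    uint32_t r = ((uint32_t)src_argb[2] << 2) | (src_argb[2] >> 6);
    uint32_t a = (uint32_t)src_argb[3] >> 6; /* 8 -> 2 bits */
    uint32_t v = b | (g << 10) | (r << 20) | (a << 30);
    memcpy(dst_ar30, &v, 4); /* byte store avoids alignment assumptions */
    src_argb += 4;
    dst_ar30 += 4;
  }
}
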
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565,
- uint8* dst_argb,
+void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565,
- uint8* dst_argb,
- int width);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24,
- uint8* dst_argb,
+void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void RGB24ToARGBRow_Any_MSA(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToARGBRow_Any_MSA(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RAWToRGB24Row_Any_MSA(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565,
- uint8* dst_argb,
+
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void RGB565ToARGBRow_Any_MSA(const uint8* src_rgb565,
- uint8* dst_argb,
- int width);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGB1555ToARGBRow_Any_MSA(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void RGB24ToARGBRow_Any_DSPR2(const uint8* src_rgb24,
- uint8* dst_argb,
+void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void RAWToARGBRow_Any_DSPR2(const uint8* src_raw, uint8* dst_argb, int width);
-void RGB565ToARGBRow_Any_DSPR2(const uint8* src_rgb565,
- uint8* dst_argb,
- int width);
-void ARGB1555ToARGBRow_Any_DSPR2(const uint8* src_argb1555,
- uint8* dst_argb,
- int width);
-void ARGB4444ToARGBRow_Any_DSPR2(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
+void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGB4444ToARGBRow_Any_MSA(const uint8* src_argb4444,
- uint8* dst_argb,
- int width);
-
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-
-void ARGBToRGB565DitherRow_C(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
- int width);
-void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
-void I444ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
+ int width);
+
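dither4 packs four 8-bit ordered-dither offsets, one per x&3 column; each offset is added to the channels before they are truncated from 8:8:8 down to 5:6:5, trading banding for noise. A sketch (hypothetical _Sketch names):

#include <stdint.h>

static uint8_t DitherClamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

static void ARGBToRGB565DitherRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_rgb,
                                         const uint32_t dither4, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int dither = (dither4 >> ((x & 3) * 8)) & 0xff; /* per-column offset */
    int b = DitherClamp255(src_argb[0] + dither) >> 3;
    int g = DitherClamp255(src_argb[1] + dither) >> 2;
    int r = DitherClamp255(src_argb[2] + dither) >> 3;
    dst_rgb[0] = (uint8_t)(b | (g << 5)); /* little-endian 5:6:5 */
    dst_rgb[1] = (uint8_t)((g >> 3) | (r << 3));
    src_argb += 4;
    dst_rgb += 2;
  }
}
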
+void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
+ int width);
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width);
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
+ int width);
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
+ int width);
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width);
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width);
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToAR30Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
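
The I4xx/I2xx/NV12 rows here all take a YuvConstants table, which carries the matrix the SIMD kernels apply. As a reference point, the classic BT.601 studio-swing math they vectorize looks like this in scalar fixed point (a sketch with generic textbook constants, not the exact table layout):

#include <stdint.h>

static uint8_t YuvClamp(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixel_Sketch(uint8_t y, uint8_t u, uint8_t v,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 298; /* 1.164 in 8.8 fixed point */
  *b = YuvClamp((y1 + 516 * (u - 128) + 128) >> 8); /* + 2.018 U */
  *g = YuvClamp((y1 - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8);
  *r = YuvClamp((y1 + 409 * (v - 128) + 128) >> 8); /* + 1.596 V */
}
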
-void I422AlphaToARGBRow_C(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGBARow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToRGB24Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGBARow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_MSA(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
// ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBBlendRow_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBBlendRow_C(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
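
The blend rows above each process a single row of preattenuated (premultiplied-alpha) ARGB pixels; callers are expected to walk the plane themselves using the byte strides. A minimal sketch of that calling convention, using only the ARGBBlendRow_C declaration above (BlendImage is a hypothetical helper name; the per-pixel math stays inside the row function):

    #include <stdint.h>
    #include "libyuv/row.h"

    // Composite src0 over src1 row by row; strides are in bytes,
    // width and height in pixels.
    static void BlendImage(const uint8_t* src0, int stride0,
                           const uint8_t* src1, int stride1,
                           uint8_t* dst, int dst_stride,
                           int width, int height) {
      for (int y = 0; y < height; ++y) {
        ARGBBlendRow_C(src0 + (int64_t)y * stride0,
                       src1 + (int64_t)y * stride1,
                       dst + (int64_t)y * dst_stride, width);
      }
    }
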
// Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void BlendPlaneRow_AVX2(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void BlendPlaneRow_C(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width);
+void BlendPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width);
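
For the unattenuated planar blend, the alpha plane weights src0 against src1 per byte. An illustrative C reference of the conventional formula (BlendPlaneRowRef is a hypothetical name; the optimized rows may use slightly different rounding):

    #include <stdint.h>

    // dst = (src0 * alpha + src1 * (255 - alpha)) / 255, per byte.
    // Rounding here is illustrative only.
    static void BlendPlaneRowRef(const uint8_t* src0, const uint8_t* src1,
                                 const uint8_t* alpha, uint8_t* dst,
                                 int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = (uint8_t)((src0[i] * alpha[i] +
                            src1[i] * (255 - alpha[i]) + 127) / 255);
      }
    }
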
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBMultiplyRow_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
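
These kernels are normally reached through libyuv's CPU dispatch: the caller picks the widest variant the CPU supports, uses the _Any_ wrapper when width is not a multiple of the kernel's pixel step, and falls back to C otherwise. A condensed sketch of that pattern, assuming TestCpuFlag/kCpuHas* from libyuv/cpu_id.h and IS_ALIGNED from libyuv/basic_types.h (ChooseMultiplyRow is a hypothetical helper; treat the 4/8-pixel steps as illustrative):

    #include "libyuv/basic_types.h"  // IS_ALIGNED
    #include "libyuv/cpu_id.h"       // TestCpuFlag, kCpuHasSSE2, kCpuHasAVX2
    #include "libyuv/row.h"

    typedef void (*MultiplyRow)(const uint8_t*, const uint8_t*,
                                uint8_t*, int);

    static MultiplyRow ChooseMultiplyRow(int width) {
    #if defined(HAS_ARGBMULTIPLYROW_AVX2)
      if (TestCpuFlag(kCpuHasAVX2)) {
        return IS_ALIGNED(width, 8) ? ARGBMultiplyRow_AVX2
                                    : ARGBMultiplyRow_Any_AVX2;
      }
    #endif
    #if defined(HAS_ARGBMULTIPLYROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2)) {
        return IS_ALIGNED(width, 4) ? ARGBMultiplyRow_SSE2
                                    : ARGBMultiplyRow_Any_SSE2;
      }
    #endif
      return ARGBMultiplyRow_C;
    }
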
// ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBAddRow_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBAddRow_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBAddRow_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBAddRow_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBAddRow_Any_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
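
The add rows clamp per channel rather than wrapping (the SSE2 kernel corresponds to a saturating byte add such as paddusb). A tiny C reference of that behavior, under the hypothetical name ARGBAddRowRef:

    #include <stdint.h>

    // Per-channel saturating add of two ARGB rows; width is in pixels.
    static void ARGBAddRowRef(const uint8_t* src0, const uint8_t* src1,
                              uint8_t* dst, int width) {
      for (int i = 0; i < width * 4; ++i) {
        int sum = src0[i] + src1[i];
        dst[i] = (uint8_t)(sum > 255 ? 255 : sum);
      }
    }
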
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void ARGBSubtractRow_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width);
-void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width);
+void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
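
Subtraction mirrors the add rows but clamps at zero (a saturating byte subtract, psubusb-style). Reference sketch, hypothetical name ARGBSubtractRowRef:

    #include <stdint.h>

    // Per-channel saturating subtract, src0 - src1, clamped at 0.
    static void ARGBSubtractRowRef(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
      for (int i = 0; i < width * 4; ++i) {
        int d = src0[i] - src1[i];
        dst[i] = (uint8_t)(d < 0 ? 0 : d);
      }
    }
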
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
int width);
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
int width);
-void ARGBToRGB24Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_MSA(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToARGB4444Row_Any_MSA(const uint8* src_argb,
- uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBToRGB565DitherRow_Any_MSA(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
int width);
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB1555Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ const uint32_t param,
+ int width);
+
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ const struct YuvConstants* yuvconstants,
+ int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I444ToARGBRow_Any_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB4444Row_Any_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_Any_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGB1555Row_Any_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I411ToARGBRow_Any_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void NV12ToARGBRow_Any_DSPR2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width);
-void I444ToARGBRow_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGBRow_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGBARow_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422AlphaToARGBRow_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB24Row_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToRGB565Row_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB4444Row_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void I422ToARGB1555Row_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToARGBRow_Any_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV12ToRGB565Row_Any_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void NV21ToARGBRow_Any_MSA(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToARGBRow_Any_MSA(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void UYVYToARGBRow_Any_MSA(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_MSA(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy,
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_MSA(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_MSA(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_MSA(const uint8* src_uyvy,
- int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
-void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr,
+ int src_stride_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width);
+void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width);
+void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
+void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
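
The new UVToVURow entry points swap the bytes of each interleaved chroma pair, which is the core of NV12-style to NV21-style reordering. A minimal C reference under the hypothetical name UVToVURowRef, assuming width counts UV pairs:

    #include <stdint.h>

    // {U0,V0,U1,V1,...} -> {V0,U0,V1,U1,...}; width counts UV pairs.
    static void UVToVURowRef(const uint8_t* src_uv, uint8_t* dst_vu,
                             int width) {
      for (int i = 0; i < width; ++i) {
        dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V
        dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // U
      }
    }
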
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_uv,
+ int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
+ int stride_ayuv,
+ uint8_t* dst_vu,
+ int width);
-void I422ToYUY2Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width);
-void I422ToUYVYRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void I422ToYUY2Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width);
-void I422ToUYVYRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width);
-void I422ToYUY2Row_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
-void I422ToUYVYRow_Any_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
+ int width);
+void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ptr,
int width);
// Effects-related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
-void ARGBAttenuateRow_Any_MSA(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
int width);
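// A minimal C sketch of the attenuate operation these rows implement,
// assuming the usual premultiplied-alpha definition (each color channel
// scaled by alpha); exact rounding in the SIMD paths may differ.
// AttenuateRowExample_C is a hypothetical name.
#include <stdint.h>
static void AttenuateRowExample_C(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((src_argb[0] * a) >> 8);  // B
    dst_argb[1] = (uint8_t)((src_argb[1] * a) >> 8);  // G
    dst_argb[2] = (uint8_t)((src_argb[2] * a) >> 8);  // R
    dst_argb[3] = (uint8_t)a;                         // A passes through
    src_argb += 4;
    dst_argb += 4;
  }
}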
// Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- int width);
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width);
-
-void ARGBSepiaRow_C(uint8* dst_argb, int width);
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
-void ARGBSepiaRow_MSA(uint8* dst_argb, int width);
-
-void ARGBColorMatrixRow_C(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
- int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
+ int width);
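// A sketch of why fixed_invtbl8 exists: unattenuate needs dst = src * 255 / a
// per channel, and a 16.16 reciprocal table turns that divide into a multiply.
// The constants below are illustrative; the library's actual table may be
// biased differently for rounding. All names here are hypothetical.
#include <stdint.h>
static uint32_t inv_table_example[256];
static void InitInvTableExample(void) {
  int a;
  inv_table_example[0] = 0;  // a == 0: nothing to recover
  for (a = 1; a < 256; ++a) {
    inv_table_example[a] = (255u << 16) / (uint32_t)a;  // ~(255/a) in 16.16
  }
}
static uint8_t UnattenuateChannelExample(uint8_t src, uint8_t a) {
  uint32_t v = ((uint32_t)src * inv_table_example[a]) >> 16;
  return (uint8_t)(v > 255 ? 255 : v);
}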
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width);
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width);
-void ARGBQuantizeRow_C(uint8* dst_argb,
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+
+void RGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width);
+
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb,
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width);
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width);
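// A sketch of the quantize step, assuming scale is a 16.16 multiplier:
// each color channel is bucketed to (c * scale >> 16) * interval_size +
// interval_offset, with alpha untouched. QuantizeRowExample_C is a
// hypothetical name mirroring the in-place, dst-only signatures above.
#include <stdint.h>
static void QuantizeRowExample_C(uint8_t* dst_argb, int scale,
                                 int interval_size, int interval_offset,
                                 int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {  // B, G, R; dst_argb[3] (A) is left as-is
      dst_argb[c] = (uint8_t)((dst_argb[c] * scale >> 16) * interval_size +
                              interval_offset);
    }
    dst_argb += 4;
  }
}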
-void ARGBShadeRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+ uint32_t value);
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
+ uint32_t value);
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value);
-void ARGBShadeRow_MSA(const uint8* src_argb,
- uint8* dst_argb,
+ uint32_t value);
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value);
+ uint32_t value);
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value);
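// A sketch of shading. The packing of the uint32_t value is an assumption
// here (one multiplier byte per channel, B in the low byte), applied as an
// 8.8 fixed-point multiply; treat the layout as illustrative only.
#include <stdint.h>
static void ShadeRowExample_C(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width, uint32_t value) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      uint32_t m = (value >> (8 * c)) & 0xff;  // assumed per-channel scale
      dst_argb[c] = (uint8_t)((src_argb[c] * m) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}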
// Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft,
- const int32* botleft,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
int width,
int area,
- uint8* dst,
+ uint8_t* dst,
int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width);
-void CumulativeSumToAverageRow_C(const int32* topleft,
- const int32* botleft,
- int width,
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width);
+
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+ const int32_t* bl,
+ int w,
int area,
- uint8* dst,
+ uint8_t* dst,
int count);
-void ComputeCumulativeSumRow_C(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width);
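// A sketch of the summed-area-table idea behind these blur helpers: each
// cumsum entry is a running sum along the row plus the row above, so a box
// average later costs four lookups per output regardless of window size.
// Single channel shown, boundary handling omitted; names are hypothetical.
#include <stdint.h>
static void CumulativeSumRowExample(const uint8_t* row, int32_t* cumsum,
                                    const int32_t* previous_cumsum,
                                    int width) {
  int32_t running = 0;
  int x;
  for (x = 0; x < width; ++x) {
    running += row[x];
    cumsum[x] = running + previous_cumsum[x];
  }
}
// Four-corner identity for a window of w columns between the top-left and
// bottom-left table rows: sum = br - bl - tr + tl.
static uint8_t BoxAverageExample(const int32_t* topleft,
                                 const int32_t* botleft, int w, int area) {
  int32_t sum = botleft[w] - botleft[0] - topleft[w] + topleft[0];
  return (uint8_t)(sum / area);
}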
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb,
+void ARGBAffineRow_C(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width);
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb,
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
- const float* uv_dudv,
+ uint8_t* dst_argb,
+ const float* src_dudv,
int width);
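// A sketch of the affine row walk, assuming uv_dudv holds {u, v, du, dv}:
// a starting source coordinate plus a per-destination-pixel step. Point
// sampling, no bounds checks (the caller is assumed to keep the walk inside
// the source). AffineRowExample_C is a hypothetical name.
#include <stdint.h>
#include <string.h>
static void AffineRowExample_C(const uint8_t* src_argb, int src_argb_stride,
                               uint8_t* dst_argb, const float* uv_dudv,
                               int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    memcpy(dst_argb + i * 4, src_argb + y * src_argb_stride + x * 4, 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}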
// Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
+void InterpolateRow_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
int width,
int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
- int width,
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
- int width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
- int width,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int dst_width,
int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
- int width,
- int source_y_fraction);
-void InterpolateRow_MSA(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
int width,
int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr,
- const uint8* src_ptr,
- ptrdiff_t src_stride_ptr,
- int width,
- int source_y_fraction);
-void InterpolateRow_Any_MSA(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width,
+ int source_y_fraction);
+void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride_ptr,
int width,
int source_y_fraction);
-void InterpolateRow_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
- ptrdiff_t src_stride_ptr,
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
int width,
int source_y_fraction);
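// A sketch of the vertical blend these interpolate rows perform, assuming
// source_y_fraction is in [0, 256] with 0 selecting the row at src_ptr and
// larger values weighting the row one stride below. Rounding in the real
// paths may differ slightly.
#include <stddef.h>
#include <stdint.h>
static void InterpolateRowExample_C(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int width,
                                    int source_y_fraction) {
  const uint8_t* src1 = src_ptr + src_stride;
  int f = source_y_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - f) + src1[x] * f) >> 8);
  }
}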
// Sobel images.
-void SobelXRow_C(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width);
-void SobelXRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width);
-void SobelXRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width);
-void SobelYRow_C(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelXRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width);
+void SobelYRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width);
-void SobelYRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width);
-void SobelYRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width);
-void SobelRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelYRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width);
+void SobelRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width);
-void SobelToPlaneRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width);
-void SobelXYRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width);
+void SobelXYRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelXYRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelXYRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width);
+void SobelRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelRow_Any_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelToPlaneRow_Any_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_Any_NEON(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
-void SobelXYRow_Any_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_Any_MSA(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
+ int width);
+void SobelXYRow_Any_MMI(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_ptr,
int width);
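// A sketch of the Sobel combine stage: the X and Y gradient magnitudes are
// summed, clamped to 255, and splatted into a gray ARGB pixel (SobelXY
// instead keeps the components in separate channels). The exact combine is
// an assumption here; the name is hypothetical.
#include <stdint.h>
static void SobelCombineExample_C(const uint8_t* src_sobelx,
                                  const uint8_t* src_sobely,
                                  uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int s = src_sobelx[x] + src_sobely[x];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[0] = v;    // B
    dst_argb[1] = v;    // G
    dst_argb[2] = v;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}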
-void ARGBPolynomialRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width);
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width);
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width);
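// A sketch of the per-channel cubic these polynomial rows evaluate. The
// coefficient layout assumed here (16 floats, with channel c in 0..3 using
// poly[c], poly[c + 4], poly[c + 8], poly[c + 12] for powers 0 through 3)
// is an assumption, not a quoted contract.
#include <stdint.h>
static uint8_t PolyChannelExample(uint8_t src, const float* poly, int c) {
  float v = (float)src;
  float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
            poly[c + 12] * v * v * v;
  if (r < 0.f) r = 0.f;
  if (r > 255.f) r = 255.f;
  return (uint8_t)r;
}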
// Scale and convert to half float.
-void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_Any_SSE2(const uint16* src,
- uint16* dst,
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_F16C(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width);
-void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_Any_AVX2(const uint16* src,
- uint16* dst,
- float scale,
- int width);
-void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_Any_F16C(const uint16* src,
- uint16* dst,
- float scale,
- int width);
-void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloat1Row_Any_F16C(const uint16* src,
- uint16* dst,
+void HalfFloat1Row_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_Any_F16C(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width);
-void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_Any_NEON(const uint16* src,
- uint16* dst,
- float scale,
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
int width);
-void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloat1Row_Any_NEON(const uint16* src,
- uint16* dst,
- float scale,
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
int width);
+void HalfFloatRow_MSA(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width);
+void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
+ float param,
+ int width);
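// A sketch of the scale-and-convert these half-float rows perform, using the
// classic exponent-bias trick: multiplying a positive float by 2^-112 drops
// its exponent into binary16 range, after which the top bits are the half.
// Truncating rounding; subnormal results are not handled. Names are
// hypothetical.
#include <stdint.h>
#include <string.h>
static uint16_t FloatToHalfExample(float f) {
  float biased = f * 1.9259299444e-34f;  // 2^-112
  uint32_t bits;
  memcpy(&bits, &biased, 4);
  return (uint16_t)(bits >> 13);
}
static void HalfFloatRowExample_C(const uint16_t* src, uint16_t* dst,
                                  float scale, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = FloatToHalfExample((float)src[x] * scale);
  }
}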
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
+ float* dst_ptr,
+ float param,
+ int width);
-void ARGBLumaColorTableRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* luma,
+ uint32_t lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff);
+ const uint8_t* luma,
+ uint32_t lumacoeff);
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width);
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+
+void FloatDivToByteRow_C(const float* src_weights,
+ const float* src_values,
+ uint8_t* dst_out,
+ uint8_t* dst_mask,
+ int width);
+void FloatDivToByteRow_NEON(const float* src_weights,
+ const float* src_values,
+ uint8_t* dst_out,
+ uint8_t* dst_mask,
+ int width);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/include/libyuv/scale.h b/files/include/libyuv/scale.h
index 6d6b9a8..23ba163 100644
--- a/files/include/libyuv/scale.h
+++ b/files/include/libyuv/scale.h
@@ -28,22 +28,22 @@
// Scale a YUV plane.
LIBYUV_API
-void ScalePlane(const uint8* src,
+void ScalePlane(const uint8_t* src,
int src_stride,
int src_width,
int src_height,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
-void ScalePlane_16(const uint16* src,
+void ScalePlane_16(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
- uint16* dst,
+ uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
@@ -60,38 +60,86 @@
// Returns 0 if successful.
LIBYUV_API
-int I420Scale(const uint8* src_y,
+int I420Scale(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_width,
int src_height,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
enum FilterMode filtering);
LIBYUV_API
-int I420Scale_16(const uint16* src_y,
+int I420Scale_16(const uint16_t* src_y,
int src_stride_y,
- const uint16* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint16* src_v,
+ const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
- uint16* dst_y,
+ uint16_t* dst_y,
int dst_stride_y,
- uint16* dst_u,
+ uint16_t* dst_u,
int dst_stride_u,
- uint16* dst_v,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:4:4 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering);
+
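// A hypothetical usage sketch for I444Scale above, assuming three tightly
// packed planes (4:4:4, so each plane's stride equals the frame width):
#include "libyuv/scale.h"
static int ScaleI444FrameExample(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, int sw, int sh,
                                 uint8_t* dst_y, uint8_t* dst_u,
                                 uint8_t* dst_v, int dw, int dh) {
  return I444Scale(src_y, sw, src_u, sw, src_v, sw, sw, sh,
                   dst_y, dw, dst_u, dw, dst_v, dw, dw, dh,
                   kFilterBilinear);  // returns 0 on success
}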
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
@@ -100,17 +148,17 @@
#ifdef __cplusplus
// Legacy API. Deprecated.
LIBYUV_API
-int Scale(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
+int Scale(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
int src_stride_y,
int src_stride_u,
int src_stride_v,
int src_width,
int src_height,
- uint8* dst_y,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_y,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int dst_stride_y,
int dst_stride_u,
int dst_stride_v,
@@ -118,17 +166,6 @@
int dst_height,
LIBYUV_BOOL interpolate);
-// Legacy API. Deprecated.
-LIBYUV_API
-int ScaleOffset(const uint8* src_i420,
- int src_width,
- int src_height,
- uint8* dst_i420,
- int dst_width,
- int dst_height,
- int dst_yoffset,
- LIBYUV_BOOL interpolate);
-
// For testing, allow disabling of specialized scalers.
LIBYUV_API
void SetUseReferenceImpl(LIBYUV_BOOL use);
diff --git a/files/include/libyuv/scale_argb.h b/files/include/libyuv/scale_argb.h
index 3d25e57..7641f18 100644
--- a/files/include/libyuv/scale_argb.h
+++ b/files/include/libyuv/scale_argb.h
@@ -20,11 +20,11 @@
#endif
LIBYUV_API
-int ARGBScale(const uint8* src_argb,
+int ARGBScale(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -32,11 +32,11 @@
// Clipped scale takes destination rectangle coordinates for clip values.
LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb,
+int ARGBScaleClip(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -48,18 +48,18 @@
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y,
+int YUVToARGBScaleClip(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint32 src_fourcc,
+ uint32_t src_fourcc,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- uint32 dst_fourcc,
+ uint32_t dst_fourcc,
int dst_width,
int dst_height,
int clip_x,
diff --git a/files/include/libyuv/scale_row.h b/files/include/libyuv/scale_row.h
index edb46cc..6e207a9 100644
--- a/files/include/libyuv/scale_row.h
+++ b/files/include/libyuv/scale_row.h
@@ -19,17 +19,20 @@
extern "C" {
#endif
-#if defined(__pnacl__) || defined(__CLR_VER) || \
- (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) || \
+ (defined(__native_client__) && defined(__x86_64__)) || \
+ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#define LIBYUV_DISABLE_X86
#endif
#endif
-
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -55,6 +58,7 @@
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
+#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
@@ -66,7 +70,6 @@
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
-#define HAS_SCALEADDROW_SSE2
#endif
// The following are available on all x86 platforms, but
@@ -81,9 +84,11 @@
#endif
// The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON
+#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
@@ -91,26 +96,36 @@
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
-#define HAS_SCALEARGBFILTERCOLS_NEON
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_DSPR2) && !defined(__native_client__) && \
- defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_DSPR2
-#define HAS_SCALEROWDOWN4_DSPR2
-#define HAS_SCALEROWDOWN34_DSPR2
-#define HAS_SCALEROWDOWN38_DSPR2
-#define HAS_SCALEADDROW_DSPR2
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
#define HAS_SCALEARGBROWDOWN2_MSA
#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
#define HAS_SCALEROWDOWN2_MSA
-#define HAS_SCALEROWDOWN4_MSA
+#define HAS_SCALEROWDOWN34_MSA
#define HAS_SCALEROWDOWN38_MSA
-#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#endif
+
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+#define HAS_FIXEDDIV1_MIPS
+#define HAS_FIXEDDIV_MIPS
+#define HAS_SCALEADDROW_16_MMI
+#define HAS_SCALEADDROW_MMI
+#define HAS_SCALEARGBCOLS_MMI
+#define HAS_SCALEARGBCOLSUP2_MMI
+#define HAS_SCALEARGBROWDOWN2_MMI
+#define HAS_SCALEARGBROWDOWNEVEN_MMI
+#define HAS_SCALECOLS_16_MMI
+#define HAS_SCALECOLS_MMI
+#define HAS_SCALEROWDOWN2_16_MMI
+#define HAS_SCALEROWDOWN2_MMI
+#define HAS_SCALEROWDOWN4_16_MMI
+#define HAS_SCALEROWDOWN4_MMI
#endif
// Scale ARGB vertically with bilinear interpolation.
@@ -119,8 +134,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int y,
int dy,
@@ -132,8 +147,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_argb,
- uint16* dst_argb,
+ const uint16_t* src_argb,
+ uint16_t* dst_argb,
int x,
int y,
int dy,
@@ -150,12 +165,17 @@
// Divide num by div and return as a 16.16 fixed-point result.
int FixedDiv_C(int num, int div);
int FixedDiv_X86(int num, int div);
+int FixedDiv_MIPS(int num, int div);
// Divide num - 1 by div - 1 and return as a 16.16 fixed-point result.
int FixedDiv1_C(int num, int div);
int FixedDiv1_X86(int num, int div);
+int FixedDiv1_MIPS(int num, int div);
#ifdef HAS_FIXEDDIV_X86
#define FixedDiv FixedDiv_X86
#define FixedDiv1 FixedDiv1_X86
+#elif defined HAS_FIXEDDIV_MIPS
+#define FixedDiv FixedDiv_MIPS
+#define FixedDiv1 FixedDiv1_MIPS
#else
#define FixedDiv FixedDiv_C
#define FixedDiv1 FixedDiv1_C
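// A sketch of the 16.16 fixed-point divide behind these macros: widen,
// shift the numerator up 16 bits, divide. FixedDivExample_C is a
// hypothetical name for the same contract as FixedDiv_C above.
#include <stdint.h>
static int FixedDivExample_C(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}
// e.g. FixedDivExample_C(3, 4) == 0xC000, i.e. 0.75 in 16.16.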
@@ -172,405 +192,441 @@
int* dx,
int* dy);
-void ScaleRowDown2_C(const uint8* src_ptr,
+void ScaleRowDown2_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr,
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr,
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr,
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr,
+void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr,
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr,
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr,
+void ScaleRowDown34_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr,
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* d,
+ uint8_t* d,
int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* d,
+ uint16_t* d,
int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* d,
+ uint8_t* d,
int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* d,
+ uint16_t* d,
int dst_width);
-void ScaleCols_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleCols_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleColsUp2_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int,
int);
-void ScaleColsUp2_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int,
int);
-void ScaleFilterCols_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
- int x,
+ int x32,
int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
- int x,
+ int x32,
int dx);
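// A sketch of the filtered column walk these declarations share: x and dx
// are 16.16 fixed point (see FixedDiv above), and each output pixel blends
// the two source pixels straddling the current position. The 7-bit blend
// below is one common choice; the library's exact rounding may differ.
#include <stdint.h>
static void FilterColsExample_C(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source index
    int xf = (x >> 9) & 0x7f;  // top 7 fraction bits
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (128 - xf) + b * xf) >> 7);
    x += dx;
  }
}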
-void ScaleRowDown38_C(const uint8* src_ptr,
+void ScaleRowDown38_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr,
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width);
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr,
+ uint16_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr,
+ uint16_t* dst_ptr,
int dst_width);
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb,
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBCols64_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
- int x,
+ int x32,
int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int,
int);
-void ScaleARGBFilterCols_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
- int x,
+ int x32,
int dx);
// Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr,
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr,
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr,
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr,
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr,
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
-void ScaleFilterCols_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
// ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
// ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
@@ -584,274 +640,274 @@
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_argb,
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_argb,
- ptrdiff_t src_stride,
- uint8_t* dst_argb,
- int dst_width);
-void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_argb,
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
+ int src_stepx,
uint8_t* dst_argb,
int dst_width);
-
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
- int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb,
- int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_argb,
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
- uint8_t* dst_argb,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleARGBRowDownEvenBox_Any_MSA(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int32_t src_stepx,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_ptr,
int dst_width);
// ScaleRowDown2Box is also used by planar functions
// NEON downscalers with interpolation.
// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr,
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width);
-void ScaleRowDown4_NEON(const uint8* src_ptr,
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr,
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width);
-void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
-void ScaleFilterCols_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx);
-void ScaleRowDown2_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* d,
- int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst,
- int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr,
- int dst_width);
-void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_DSPR2(const uint8* src_ptr,
- uint16* dst_ptr,
- int src_width);
-
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
@@ -885,29 +941,47 @@
uint8_t* dst_ptr,
int dst_width);
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width);
+
void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8_t* dst,
+ uint8_t* dst_ptr,
int dst_width);
void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
@@ -920,7 +994,111 @@
void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width);
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width);
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx);
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx);
+
+void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst_ptr,
+ int dst_width);
+void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/include/libyuv/version.h b/files/include/libyuv/version.h
index 5b5f539..741ef34 100644
--- a/files/include/libyuv/version.h
+++ b/files/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1652
+#define LIBYUV_VERSION 1732
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/files/include/libyuv/video_common.h b/files/include/libyuv/video_common.h
index f3711c4..ffcbdbf 100644
--- a/files/include/libyuv/video_common.h
+++ b/files/include/libyuv/video_common.h
@@ -28,13 +28,14 @@
// Needs to be a macro, otherwise the OS X compiler complains when the kFormat*
// constants are used in a switch.
#ifdef __cplusplus
-#define FOURCC(a, b, c, d) \
- ((static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
- (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#define FOURCC(a, b, c, d) \
+ ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+ (static_cast<uint32_t>(c) << 16) | /* NOLINT */ \
+ (static_cast<uint32_t>(d) << 24)) /* NOLINT */
#else
-#define FOURCC(a, b, c, d) \
- (((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
- ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#define FOURCC(a, b, c, d) \
+ (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \
+ ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
#endif
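As a worked example of the macro above: the four characters pack little-endian, so the first argument lands in the low byte.

// FOURCC('I', '4', '2', '0'), evaluated by hand:
//   'I' = 0x49, '4' = 0x34, '2' = 0x32, '0' = 0x30
//   0x49 | (0x34 << 8) | (0x32 << 16) | (0x30 << 24) = 0x30323449
// Reading the bytes of 0x30323449 from least to most significant spells
// "I420" in memory on a little-endian machine.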
// Some pages discussing FourCC codes:
@@ -49,25 +50,26 @@
// Secondary formats are converted in 2 steps.
// Auxiliary formats call primary converters.
enum FourCC {
- // 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
- FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated.
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+ FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
// 1 Secondary YUV format: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
- // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
+ FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
@@ -75,16 +77,10 @@
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
- // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
- FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
- FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
- FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
- FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
// 1 Primary Compressed YUV format.
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
- // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 8 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -92,6 +88,7 @@
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
+ FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
@@ -112,7 +109,13 @@
FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
- // 1 Auxiliary compressed YUV format set aside for capturer.
+  // Deprecated formats. Not supported, but defined for backward compatibility.
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
FOURCC_H264 = FOURCC('H', '2', '6', '4'),
// Match any fourcc.
@@ -136,6 +139,8 @@
FOURCC_BPP_BGRA = 32,
FOURCC_BPP_ABGR = 32,
FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_AR30 = 32,
+ FOURCC_BPP_AB30 = 32,
FOURCC_BPP_24BG = 24,
FOURCC_BPP_RAW = 24,
FOURCC_BPP_RGBP = 16,
@@ -152,6 +157,8 @@
FOURCC_BPP_J420 = 12,
FOURCC_BPP_J400 = 8,
FOURCC_BPP_H420 = 12,
+ FOURCC_BPP_H422 = 16,
+ FOURCC_BPP_H010 = 24,
FOURCC_BPP_MJPG = 0, // 0 means unknown.
FOURCC_BPP_H264 = 0,
FOURCC_BPP_IYUV = 12,
@@ -174,7 +181,7 @@
};
// Converts fourcc aliases into canonical ones.
-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
#ifdef __cplusplus
} // extern "C"
diff --git a/files/infra/config/OWNERS b/files/infra/config/OWNERS
deleted file mode 100644
index 02eccd5..0000000
--- a/files/infra/config/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-set noparent
-agable@chromium.org
-kjellander@chromium.org
diff --git a/files/infra/config/PRESUBMIT.py b/files/infra/config/PRESUBMIT.py
new file mode 100644
index 0000000..89eaa51
--- /dev/null
+++ b/files/infra/config/PRESUBMIT.py
@@ -0,0 +1,15 @@
+# Copyright 2018 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+
+def CheckChangeOnUpload(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
+
+
+def CheckChangeOnCommit(input_api, output_api):
+ return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api)
diff --git a/files/infra/config/cq.cfg b/files/infra/config/cq.cfg
index ff0347d..7bcc059 100644
--- a/files/infra/config/cq.cfg
+++ b/files/infra/config/cq.cfg
@@ -2,21 +2,12 @@
# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg.
version: 1
-cq_name: "libyuv"
cq_status_url: "https://chromium-cq-status.appspot.com"
git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git"
gerrit {}
-rietveld {
- url: "https://codereview.chromium.org"
-}
-
verifiers {
- reviewer_lgtm {
- committer_list: "project-libyuv-committers"
- dry_run_access_list: "project-libyuv-tryjob-access"
- }
gerrit_cq_ability {
committer_list: "project-libyuv-committers"
dry_run_access_list: "project-libyuv-tryjob-access"
@@ -24,7 +15,7 @@
try_job {
buckets {
- name: "master.tryserver.libyuv"
+ name: "luci.libyuv.try"
builders { name: "win" }
builders { name: "win_rel" }
builders { name: "win_x64_rel" }
@@ -45,7 +36,6 @@
experiment_percentage: 100
}
builders { name: "linux_memcheck" }
- builders { name: "linux_msan" }
builders { name: "linux_tsan2" }
builders { name: "linux_asan" }
builders { name: "linux_msan" }
@@ -53,14 +43,9 @@
builders { name: "linux_ubsan_vptr" }
builders { name: "android" }
builders { name: "android_rel" }
- builders { name: "android_clang" }
builders { name: "android_arm64" }
builders { name: "android_x86" }
builders { name: "android_x64" }
- builders {
- name: "android_mips"
- experiment_percentage: 100
- }
}
}
}
diff --git a/files/libyuv.gni b/files/libyuv.gni
index 89e4d38..8df40ba 100644
--- a/files/libyuv.gni
+++ b/files/libyuv.gni
@@ -13,8 +13,11 @@
declare_args() {
libyuv_include_tests = !build_with_chromium
libyuv_disable_jpeg = false
- libyuv_use_neon = (current_cpu == "arm64" ||
- (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon)))
- libyuv_use_msa = (current_cpu == "mips64el" || current_cpu == "mipsel") &&
- mips_use_msa
+ libyuv_use_neon =
+ current_cpu == "arm64" ||
+ (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
+ libyuv_use_msa =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
+ libyuv_use_mmi =
+ (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
}
diff --git a/files/linux.mk b/files/linux.mk
index 923345a..e9a26a7 100644
--- a/files/linux.mk
+++ b/files/linux.mk
@@ -13,6 +13,8 @@
source/compare.o \
source/compare_common.o \
source/compare_gcc.o \
+ source/compare_mmi.o \
+ source/compare_msa.o \
source/compare_neon64.o \
source/compare_neon.o \
source/compare_win.o \
@@ -32,14 +34,16 @@
source/rotate.o \
source/rotate_common.o \
source/rotate_gcc.o \
- source/rotate_dspr2.o \
+ source/rotate_mmi.o \
+ source/rotate_msa.o \
source/rotate_neon64.o \
source/rotate_neon.o \
source/rotate_win.o \
source/row_any.o \
source/row_common.o \
source/row_gcc.o \
- source/row_dspr2.o \
+ source/row_mmi.o \
+ source/row_msa.o \
source/row_neon64.o \
source/row_neon.o \
source/row_win.o \
@@ -48,7 +52,8 @@
source/scale.o \
source/scale_common.o \
source/scale_gcc.o \
- source/scale_dspr2.o \
+ source/scale_mmi.o \
+ source/scale_msa.o \
source/scale_neon64.o \
source/scale_neon.o \
source/scale_win.o \
@@ -60,14 +65,14 @@
.c.o:
$(CC) -c $(CFLAGS) $*.c -o $*.o
-all: libyuv.a convert cpuid psnr
+all: libyuv.a yuvconvert cpuid psnr
libyuv.a: $(LOCAL_OBJ_FILES)
$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)
# A C++ test utility that uses libyuv conversion.
-convert: util/convert.cc libyuv.a
- $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+yuvconvert: util/yuvconvert.cc libyuv.a
+ $(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a
# A standalone test utility
psnr: util/psnr.cc
@@ -80,4 +85,4 @@
$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a
clean:
- /bin/rm -f source/*.o *.ii *.s libyuv.a convert cpuid psnr
+ /bin/rm -f source/*.o *.ii *.s libyuv.a yuvconvert cpuid psnr
diff --git a/files/public.mk b/files/public.mk
index 090d8cb..1342307 100644
--- a/files/public.mk
+++ b/files/public.mk
@@ -1,13 +1,13 @@
-# This file contains all the common make variables which are useful for
-# anyone depending on this library.
-# Note that dependencies on NDK are not directly listed since NDK auto adds
-# them.
-
-LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
-
-LIBYUV_C_FLAGS :=
-
-LIBYUV_CPP_FLAGS :=
-
-LIBYUV_LDLIBS :=
-LIBYUV_DEP_MODULES :=
+# This file contains all the common make variables which are useful for
+# anyone depending on this library.
+# Note that dependencies on NDK are not directly listed since NDK auto adds
+# them.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/files/source/compare.cc b/files/source/compare.cc
index 1facd27..5aa3a4d 100644
--- a/files/source/compare.cc
+++ b/files/source/compare.cc
@@ -29,10 +29,11 @@
// hash seed of 5381 recommended.
LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
const int kBlockSize = 1 << 15; // 32768;
int remainder;
- uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+ uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
+ HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
if (TestCpuFlag(kCpuHasSSE41)) {
HashDjb2_SSE = HashDjb2_SSE41;
@@ -44,7 +45,7 @@
}
#endif
- while (count >= (uint64)(kBlockSize)) {
+ while (count >= (uint64_t)(kBlockSize)) {
seed = HashDjb2_SSE(src, kBlockSize, seed);
src += kBlockSize;
count -= kBlockSize;
@@ -62,7 +63,7 @@
return seed;
}
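Because each 32 KB block's result is fed back in as the seed for the next call, hashing a buffer in chunks matches hashing it in one pass. A scalar restatement of that chaining property (a sketch, using the same hash = hash * 33 + byte recurrence as HashDjb2_C):

// Sketch: djb2 composes under seed chaining.
static uint32_t Djb2(const uint8_t* p, int n, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < n; ++i) {
    hash = hash * 33 + p[i];  // same as hash += (hash << 5) + p[i]
  }
  return hash;
}
// For any split point m:
//   Djb2(buf, n, 5381) == Djb2(buf + m, n - m, Djb2(buf, m, 5381))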
-static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
@@ -93,8 +94,11 @@
// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
- uint32 fourcc = 0;
+uint32_t ARGBDetect(const uint8_t* argb,
+ int stride_argb,
+ int width,
+ int height) {
+ uint32_t fourcc = 0;
int h;
// Coalesce rows.
@@ -110,20 +114,86 @@
return fourcc;
}
+// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
+// So the actual maximum is 1 less loop, which is 65536 - 32 bytes.
+
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ const int kBlockSize = 1 << 15; // 32768;
+ const int kSimdSize = 64;
+  // SIMD for multiples of 64, and C for the remainder
+ int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
+ uint64_t diff = 0;
+ int i;
+ uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
+ int count) = HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ HammingDistance = HammingDistance_NEON;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ HammingDistance = HammingDistance_SSSE3;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+ if (TestCpuFlag(kCpuHasSSE42)) {
+ HammingDistance = HammingDistance_SSE42;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HammingDistance = HammingDistance_AVX2;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ HammingDistance = HammingDistance_MSA;
+ }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ HammingDistance = HammingDistance_MMI;
+ }
+#endif
+
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ diff += HammingDistance(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & (kSimdSize - 1);
+ if (remainder) {
+ diff += HammingDistance_C(src_a, src_b, remainder);
+ }
+ return diff;
+}
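The split above hands full 32768-byte blocks to the (possibly OpenMP-parallelized) loop, a multiple-of-64 remainder to the SIMD kernel, and the last few bytes to the C version. Worked through for count = 100000:

//   full blocks : 100000 & ~(32768 - 1)            = 98304 (3 blocks)
//   SIMD tail   : 100000 & (32768 - 1) & ~(64 - 1) = 1664
//   scalar tail : 100000 & (64 - 1)                = 32
//   98304 + 1664 + 32 == 100000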
+
// TODO(fbarchard): Refactor into row function.
LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
- const uint8* src_b,
- int count) {
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
// SumSquareError returns values 0 to 65535 for each squared difference.
- // Up to 65536 of those can be summed and remain within a uint32.
- // After each block of 65536 pixels, accumulate into a uint64.
+ // Up to 65536 of those can be summed and remain within a uint32_t.
+ // After each block of 65536 pixels, accumulate into a uint64_t.
const int kBlockSize = 65536;
int remainder = count & (kBlockSize - 1) & ~31;
- uint64 sse = 0;
+ uint64_t sse = 0;
int i;
- uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
- SumSquareError_C;
+ uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+ int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SumSquareError = SumSquareError_NEON;
@@ -141,6 +211,16 @@
SumSquareError = SumSquareError_AVX2;
}
#endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SumSquareError = SumSquareError_MSA;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SumSquareError = SumSquareError_MMI;
+ }
+#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
@@ -162,13 +242,13 @@
}
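The block bound used above is a straightforward range argument, restated numerically:

// Why flushing every 65536 pixels keeps the uint32_t accumulator exact:
//   max squared difference per pixel: 255 * 255 = 65025
//   worst-case block sum: 65536 * 65025 = 4261478400 < 2^32 = 4294967296
// so a block's partial sum cannot wrap before it is added into the
// uint64_t total.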
LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a,
- int stride_a,
- const uint8* src_b,
- int stride_b,
- int width,
- int height) {
- uint64 sse = 0;
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+ int stride_a,
+ const uint8_t* src_b,
+ int stride_b,
+ int width,
+ int height) {
+ uint64_t sse = 0;
int h;
// Coalesce rows.
if (stride_a == width && stride_b == width) {
@@ -185,7 +265,7 @@
}
LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
double psnr;
if (sse > 0) {
double mse = (double)count / (double)sse;
@@ -194,65 +274,67 @@
psnr = kMaxPsnr; // Limit to prevent divide by 0
}
- if (psnr > kMaxPsnr)
+ if (psnr > kMaxPsnr) {
psnr = kMaxPsnr;
+ }
return psnr;
}
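With mse defined above as count / sse (the reciprocal of the usual mean squared error), the surrounding code implies the elided return line computes the standard PSNR = 10 * log10(255^2 * count / sse), capped at kMaxPsnr for identical frames. A quick numeric check of that formula (illustrative values, not from the source):

//   count = 10000 pixels, sse = 650250
//   255^2 * 10000 / 650250 = 650250000 / 650250 = 1000
//   psnr = 10 * log10(1000) = 30 dB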
LIBYUV_API
-double CalcFramePsnr(const uint8* src_a,
+double CalcFramePsnr(const uint8_t* src_a,
int stride_a,
- const uint8* src_b,
+ const uint8_t* src_b,
int stride_b,
int width,
int height) {
- const uint64 samples = width * height;
- const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
- stride_b, width, height);
+ const uint64_t samples = (uint64_t)width * (uint64_t)height;
+ const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+ stride_b, width, height);
return SumSquareErrorToPsnr(sse, samples);
}
LIBYUV_API
-double I420Psnr(const uint8* src_y_a,
+double I420Psnr(const uint8_t* src_y_a,
int stride_y_a,
- const uint8* src_u_a,
+ const uint8_t* src_u_a,
int stride_u_a,
- const uint8* src_v_a,
+ const uint8_t* src_v_a,
int stride_v_a,
- const uint8* src_y_b,
+ const uint8_t* src_y_b,
int stride_y_b,
- const uint8* src_u_b,
+ const uint8_t* src_u_b,
int stride_u_b,
- const uint8* src_v_b,
+ const uint8_t* src_v_b,
int stride_v_b,
int width,
int height) {
- const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, src_y_b,
- stride_y_b, width, height);
+ const uint64_t sse_y = ComputeSumSquareErrorPlane(
+ src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
const int width_uv = (width + 1) >> 1;
const int height_uv = (height + 1) >> 1;
- const uint64 sse_u = ComputeSumSquareErrorPlane(
+ const uint64_t sse_u = ComputeSumSquareErrorPlane(
src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
- const uint64 sse_v = ComputeSumSquareErrorPlane(
+ const uint64_t sse_v = ComputeSumSquareErrorPlane(
src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
- const uint64 samples = width * height + 2 * (width_uv * height_uv);
- const uint64 sse = sse_y + sse_u + sse_v;
+ const uint64_t samples = (uint64_t)width * (uint64_t)height +
+ 2 * ((uint64_t)width_uv * (uint64_t)height_uv);
+ const uint64_t sse = sse_y + sse_u + sse_v;
return SumSquareErrorToPsnr(sse, samples);
}
-static const int64 cc1 = 26634; // (64^2*(.01*255)^2
-static const int64 cc2 = 239708; // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;   // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708;  // 64^2*(.03*255)^2
-static double Ssim8x8_C(const uint8* src_a,
+static double Ssim8x8_C(const uint8_t* src_a,
int stride_a,
- const uint8* src_b,
+ const uint8_t* src_b,
int stride_b) {
- int64 sum_a = 0;
- int64 sum_b = 0;
- int64 sum_sq_a = 0;
- int64 sum_sq_b = 0;
- int64 sum_axb = 0;
+ int64_t sum_a = 0;
+ int64_t sum_b = 0;
+ int64_t sum_sq_a = 0;
+ int64_t sum_sq_b = 0;
+ int64_t sum_axb = 0;
int i;
for (i = 0; i < 8; ++i) {
@@ -270,20 +352,20 @@
}
{
- const int64 count = 64;
+ const int64_t count = 64;
// scale the constants by number of pixels
- const int64 c1 = (cc1 * count * count) >> 12;
- const int64 c2 = (cc2 * count * count) >> 12;
+ const int64_t c1 = (cc1 * count * count) >> 12;
+ const int64_t c2 = (cc2 * count * count) >> 12;
- const int64 sum_a_x_sum_b = sum_a * sum_b;
+ const int64_t sum_a_x_sum_b = sum_a * sum_b;
- const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
- (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+ const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
- const int64 sum_a_sq = sum_a * sum_a;
- const int64 sum_b_sq = sum_b * sum_b;
+ const int64_t sum_a_sq = sum_a * sum_a;
+ const int64_t sum_b_sq = sum_b * sum_b;
- const int64 ssim_d =
+ const int64_t ssim_d =
(sum_a_sq + sum_b_sq + c1) *
(count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
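In conventional notation, the quotient of ssim_n and ssim_d formed above is the standard SSIM index. An expository restatement with the raw 8x8 window sums (S_a = sum_a = 64 * mean_a, count = 64; not library code):

//   ssim_n = (2*S_a*S_b + c1) * (2*(count*sum_axb - S_a*S_b) + c2)
//   ssim_d = (S_a^2 + S_b^2 + c1) *
//            (count*sum_sq_a - S_a^2 + count*sum_sq_b - S_b^2 + c2)
// Each parenthesized factor is count^2 = 4096 times its textbook
// counterpart (note c1 = (cc1 * 4096) >> 12 = cc1 = 64^2 * (0.01*255)^2),
// so the scale cancels in ssim_n / ssim_d, leaving
//   SSIM = ((2*mu_a*mu_b + C1) * (2*cov_ab + C2)) /
//          ((mu_a^2 + mu_b^2 + C1) * (sigma_a^2 + sigma_b^2 + C2))
// with C1 = (0.01*255)^2 and C2 = (0.03*255)^2.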
@@ -298,15 +380,15 @@
// on the 4x4 pixel grid. Such an arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
LIBYUV_API
-double CalcFrameSsim(const uint8* src_a,
+double CalcFrameSsim(const uint8_t* src_a,
int stride_a,
- const uint8* src_b,
+ const uint8_t* src_b,
int stride_b,
int width,
int height) {
int samples = 0;
double ssim_total = 0;
- double (*Ssim8x8)(const uint8* src_a, int stride_a, const uint8* src_b,
+ double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b,
int stride_b) = Ssim8x8_C;
// sample points start at each 4x4 location
@@ -327,17 +409,17 @@
}
LIBYUV_API
-double I420Ssim(const uint8* src_y_a,
+double I420Ssim(const uint8_t* src_y_a,
int stride_y_a,
- const uint8* src_u_a,
+ const uint8_t* src_u_a,
int stride_u_a,
- const uint8* src_v_a,
+ const uint8_t* src_v_a,
int stride_v_a,
- const uint8* src_y_b,
+ const uint8_t* src_y_b,
int stride_y_b,
- const uint8* src_u_b,
+ const uint8_t* src_u_b,
int stride_u_b,
- const uint8* src_v_b,
+ const uint8_t* src_v_b,
int stride_v_b,
int width,
int height) {
diff --git a/files/source/compare_common.cc b/files/source/compare_common.cc
index 42fc589..d4b170a 100644
--- a/files/source/compare_common.cc
+++ b/files/source/compare_common.cc
@@ -17,20 +17,80 @@
extern "C" {
#endif
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse = 0u;
+#if ORIGINAL_OPT
+uint32_t HammingDistance_C1(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ int i;
+ for (i = 0; i < count; ++i) {
+ int x = src_a[i] ^ src_b[i];
+ if (x & 1)
+ ++diff;
+ if (x & 2)
+ ++diff;
+ if (x & 4)
+ ++diff;
+ if (x & 8)
+ ++diff;
+ if (x & 16)
+ ++diff;
+ if (x & 32)
+ ++diff;
+ if (x & 64)
+ ++diff;
+ if (x & 128)
+ ++diff;
+ }
+ return diff;
+}
+#endif
+
+// Hakmem method for Hamming distance.
+uint32_t HammingDistance_C(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ int i;
+ for (i = 0; i < count - 3; i += 4) {
+ uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
+ uint32_t u = x - ((x >> 1) & 0x55555555);
+ u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+ diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+ src_a += 4;
+ src_b += 4;
+ }
+
+ for (; i < count; ++i) {
+ uint32_t x = *src_a ^ *src_b;
+ uint32_t u = x - ((x >> 1) & 0x55);
+ u = ((u >> 2) & 0x33) + (u & 0x33);
+ diff += (u + (u >> 4)) & 0x0f;
+ src_a += 1;
+ src_b += 1;
+ }
+
+ return diff;
+}
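The masking sequence above is the classic parallel (Hakmem-style) bit count: adjacent 1-bit fields are summed into 2-bit fields, then 2-bit into 4-bit, and the multiply by 0x01010101 folds the four byte counts into the top byte. Pulled out as a standalone sketch for one 32-bit word, checkable against a naive loop:

// Sketch: the popcount kernel used by HammingDistance_C, isolated.
static uint32_t PopcountHakmem(uint32_t x) {
  uint32_t u = x - ((x >> 1) & 0x55555555);        // 2-bit partial sums
  u = ((u >> 2) & 0x33333333) + (u & 0x33333333);  // 4-bit partial sums
  return (((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24;
}

static uint32_t PopcountNaive(uint32_t x) {
  uint32_t n = 0;
  for (; x != 0; x >>= 1) {
    n += x & 1;
  }
  return n;
}
// PopcountHakmem(x) == PopcountNaive(x) for every 32-bit x.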
+
+uint32_t SumSquareError_C(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
int i;
for (i = 0; i < count; ++i) {
int diff = src_a[i] - src_b[i];
- sse += (uint32)(diff * diff);
+ sse += (uint32_t)(diff * diff);
}
return sse;
}
// hash seed of 5381 recommended.
// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
- uint32 hash = seed;
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
+ uint32_t hash = seed;
int i;
for (i = 0; i < count; ++i) {
hash += (hash << 5) + src[i];
diff --git a/files/source/compare_gcc.cc b/files/source/compare_gcc.cc
index 64522aa..676527c 100644
--- a/files/source/compare_gcc.cc
+++ b/files/source/compare_gcc.cc
@@ -22,124 +22,334 @@
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse;
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "lea " MEMLEA(0x10, 0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "lea " MEMLEA(0x10, 1) ",%1 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint64_t diff = 0u;
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
+ asm volatile(
+ "xor %3,%3 \n"
+ "xor %%r8,%%r8 \n"
+ "xor %%r9,%%r9 \n"
+ "xor %%r10,%%r10 \n"
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ // Process 32 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%rcx \n"
+ "mov 0x8(%0),%%rdx \n"
+ "xor (%1),%%rcx \n"
+ "xor 0x8(%1),%%rdx \n"
+ "popcnt %%rcx,%%rcx \n"
+ "popcnt %%rdx,%%rdx \n"
+ "mov 0x10(%0),%%rsi \n"
+ "mov 0x18(%0),%%rdi \n"
+ "xor 0x10(%1),%%rsi \n"
+ "xor 0x18(%1),%%rdi \n"
+ "popcnt %%rsi,%%rsi \n"
+ "popcnt %%rdi,%%rdi \n"
+ "add $0x20,%0 \n"
+ "add $0x20,%1 \n"
+ "add %%rcx,%3 \n"
+ "add %%rdx,%%r8 \n"
+ "add %%rsi,%%r9 \n"
+ "add %%rdi,%%r10 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "add %%r8, %3 \n"
+ "add %%r9, %3 \n"
+ "add %%r10, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ :
+ : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+ return static_cast<uint32_t>(diff);
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ // Process 16 bytes per loop.
+ LABELALIGN
+ "1: \n"
+ "mov (%0),%%ecx \n"
+ "mov 0x4(%0),%%edx \n"
+ "xor (%1),%%ecx \n"
+ "xor 0x4(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "mov 0x8(%0),%%ecx \n"
+ "mov 0xc(%0),%%edx \n"
+ "xor 0x8(%1),%%ecx \n"
+ "xor 0xc(%1),%%edx \n"
+ "popcnt %%ecx,%%ecx \n"
+ "add %%ecx,%3 \n"
+ "popcnt %%edx,%%edx \n"
+ "add %%edx,%3 \n"
+ "add $0x10,%0 \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "+r"(diff) // %3
+ :
+ : "memory", "cc", "ecx", "edx");
+
+ return diff;
+}
+#endif
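Both asm paths above reduce the Hamming distance to popcnt over XORed words; the 64-bit version also spreads the running sums across four registers so the chain of adds (and the popcnt false output dependency present on some Intel cores) does not serialize the loop. A portable restatement using compiler builtins, as an illustrative assumption; the library itself uses the inline asm above:

// Sketch, assuming GCC/Clang __builtin_popcountll and a count that is a
// multiple of 8 bytes (ComputeHammingDistance feeds this kernel multiples
// of 64).
#include <stdint.h>
#include <string.h>

static uint32_t HammingDistancePopcnt(const uint8_t* src_a,
                                      const uint8_t* src_b,
                                      int count) {
  uint64_t diff = 0;
  for (int i = 0; i < count; i += 8) {
    uint64_t wa, wb;
    memcpy(&wa, src_a + i, 8);  // memcpy avoids unaligned-access UB
    memcpy(&wb, src_b + i, 8);
    diff += (uint64_t)__builtin_popcountll(wa ^ wb);
  }
  return (uint32_t)diff;
}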
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "movdqa %4,%%xmm2 \n"
+ "movdqa %5,%%xmm3 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa 0x10(%0), %%xmm5 \n"
+ "pxor (%0,%1), %%xmm4 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pand %%xmm2,%%xmm6 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm6,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "paddb %%xmm7,%%xmm6 \n"
+ "pxor 0x10(%0,%1),%%xmm5 \n"
+ "add $0x20,%0 \n"
+ "movdqa %%xmm5,%%xmm4 \n"
+ "pand %%xmm2,%%xmm5 \n"
+ "psrlw $0x4,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "pshufb %%xmm5,%%xmm7 \n"
+ "pand %%xmm2,%%xmm4 \n"
+ "movdqa %%xmm3,%%xmm5 \n"
+ "pshufb %%xmm4,%%xmm5 \n"
+ "paddb %%xmm7,%%xmm5 \n"
+ "paddb %%xmm5,%%xmm6 \n"
+ "psadbw %%xmm1,%%xmm6 \n"
+ "paddd %%xmm6,%%xmm0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xaa,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0, %3 \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+
+ return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ asm volatile(
+ "vbroadcastf128 %4,%%ymm2 \n"
+ "vbroadcastf128 %5,%%ymm3 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqa (%0),%%ymm4 \n"
+ "vmovdqa 0x20(%0), %%ymm5 \n"
+ "vpxor (%0,%1), %%ymm4, %%ymm4 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm6 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
+ "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
+ "add $0x40,%0 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm5 \n"
+ "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
+ "vpand %%ymm2,%%ymm4,%%ymm4 \n"
+ "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
+ "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
+ "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
+ "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
+ "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+
+ "vpermq $0xb1,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xaa,%%ymm0,%%ymm1 \n"
+ "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovd %%xmm0, %3 \n"
+ "vzeroupper \n"
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=r"(diff) // %3
+ : "m"(kNibbleMask), // %4
+ "m"(kBitCount) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+ return diff;
+}
+#endif // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
return sse;
}
-static uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
-static uvec32 kHashMul0 = {
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16
+static const uvec32 kHashMul0 = {
0x0c3525e1, // 33 ^ 15
0xa3476dc1, // 33 ^ 14
0x3b4039a1, // 33 ^ 13
0x4f5f0981, // 33 ^ 12
};
-static uvec32 kHashMul1 = {
+static const uvec32 kHashMul1 = {
0x30f35d61, // 33 ^ 11
0x855cb541, // 33 ^ 10
0x040a9121, // 33 ^ 9
0x747c7101, // 33 ^ 8
};
-static uvec32 kHashMul2 = {
+static const uvec32 kHashMul2 = {
0xec41d4e1, // 33 ^ 7
0x4cfa3cc1, // 33 ^ 6
0x025528a1, // 33 ^ 5
0x00121881, // 33 ^ 4
};
-static uvec32 kHashMul3 = {
+static const uvec32 kHashMul3 = {
0x00008c61, // 33 ^ 3
0x00000441, // 33 ^ 2
0x00000021, // 33 ^ 1
0x00000001, // 33 ^ 0
};
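The constants above unroll djb2 sixteen bytes at a time: applying hash = hash * 33 + b sixteen times gives hash * 33^16 plus a weighted sum of the bytes, so the SIMD code multiplies the running hash by kHash16x33 while kHashMul0..kHashMul3 supply the per-byte weights 33^15 .. 33^0. A scalar restatement of that identity (a sketch, not library code):

// One 16-byte djb2 step, as the constant tables encode it:
//   hash' = hash * 33^16 + sum over k of b[k] * 33^(15 - k)
static uint32_t Djb2Step16(uint32_t hash, const uint8_t b[16]) {
  uint32_t h = hash * 0x92d9e201u;  // 33^16 mod 2^32, i.e. kHash16x33
  uint32_t mul = 1;                 // 33^0
  for (int k = 15; k >= 0; --k) {
    h += b[k] * mul;  // weights 33^0 .. 33^15, matching kHashMul3..kHashMul0
    mul *= 33;
  }
  return h;
}
// Equivalent to sixteen iterations of hash = hash * 33 + b[k].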
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile (
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "lea " MEMLEA(0x10, 0) ",%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "sub $0x10,%1 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+ uint32_t hash;
+ asm volatile(
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
return hash;
}
#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
diff --git a/files/source/compare_mmi.cc b/files/source/compare_mmi.cc
new file mode 100644
index 0000000..7640d94
--- /dev/null
+++ b/files/source/compare_mmi.cc
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Mips MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// Hakmem method for Hamming distance.
+uint32_t HammingDistance_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
+ uint64_t c1 = 0x5555555555555555;
+ uint64_t c2 = 0x3333333333333333;
+ uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
+ uint32_t c4 = 0x01010101;
+ uint64_t s1 = 1, s2 = 2, s3 = 4;
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[ta], 0(%[src_a]) \n\t"
+ "ldc1 %[tb], 0(%[src_b]) \n\t"
+ "xor %[temp], %[ta], %[tb] \n\t"
+ "psrlw %[temp1], %[temp], %[s1] \n\t" // temp1=x>>1
+ "and %[temp1], %[temp1], %[c1] \n\t" // temp1&=c1
+ "psubw %[temp1], %[temp], %[temp1] \n\t" // x-temp1
+ "and %[temp], %[temp1], %[c2] \n\t" // t = (u&c2)
+ "psrlw %[temp1], %[temp1], %[s2] \n\t" // u>>2
+ "and %[temp1], %[temp1], %[c2] \n\t" // u>>2 & c2
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // t1 = t1+t
+ "psrlw %[temp], %[temp1], %[s3] \n\t" // u>>4
+ "paddw %[temp1], %[temp1], %[temp] \n\t" // u+(u>>4)
+ "and %[temp1], %[temp1], %[c3] \n\t" //&c3
+ "dmfc1 $t0, %[temp1] \n\t"
+ "dsrl32 $t0, $t0, 0 \n\t "
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "dmfc1 $t0, %[temp1] \n\t"
+ "mul $t0, $t0, %[c4] \n\t"
+ "dsrl $t0, $t0, 24 \n\t"
+ "dadd %[diff], %[diff], $t0 \n\t"
+ "daddiu %[src_a], %[src_a], 8 \n\t"
+ "daddiu %[src_b], %[src_b], 8 \n\t"
+ "addiu %[count], %[count], -8 \n\t"
+ "bgtz %[count], 1b \n\t"
+ "nop \n\t"
+ : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
+ [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
+ [temp1] "+f"(temp1)
+ : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
+ [s2] "f"(s2), [s3] "f"(s3)
+ : "memory");
+ return diff;
+}
+
+uint32_t SumSquareError_MMI(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
+ uint32_t sse_hi = 0u, sse_lo = 0u;
+
+ uint64_t src1, src2;
+ uint64_t diff, diff_hi, diff_lo;
+ uint64_t sse_sum, sse_tmp;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"
+
+ "1: \n\t"
+ "ldc1 %[src1], 0x00(%[src_a]) \n\t"
+ "ldc1 %[src2], 0x00(%[src_b]) \n\t"
+ "pasubub %[diff], %[src1], %[src2] \n\t"
+ "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
+ "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+ "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
+ "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
+
+ "daddiu %[src_a], %[src_a], 0x08 \n\t"
+ "daddiu %[src_b], %[src_b], 0x08 \n\t"
+ "daddiu %[count], %[count], -0x08 \n\t"
+ "bnez %[count], 1b \n\t"
+
+ "mfc1 %[sse_lo], %[sse_sum] \n\t"
+ "mfhc1 %[sse_hi], %[sse_sum] \n\t"
+ "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
+ : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
+ [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
+ [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
+ [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
+ : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
+ [mask] "f"(mask)
+ : "memory");
+
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/compare_msa.cc b/files/source/compare_msa.cc
new file mode 100644
index 0000000..0b807d3
--- /dev/null
+++ b/files/source/compare_msa.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+// This module is for GCC MSA.
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+ int i;
+ v16u8 src0, src1, src2, src3;
+ v2i64 vec0 = {0}, vec1 = {0};
+
+ for (i = 0; i < count; i += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+ src0 ^= src2;
+ src1 ^= src3;
+ vec0 += __msa_pcnt_d((v2i64)src0);
+ vec1 += __msa_pcnt_d((v2i64)src1);
+ src_a += 32;
+ src_b += 32;
+ }
+
+ vec0 += vec1;
+ diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
+ diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
+ return diff;
+}
+
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
+ int i;
+ v16u8 src0, src1, src2, src3;
+ v8i16 vec0, vec1, vec2, vec3;
+ v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
+ v2i64 tmp0;
+
+ for (i = 0; i < count; i += 32) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+ reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
+ reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
+ reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
+ reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
+ src_a += 32;
+ src_b += 32;
+ }
+
+ reg0 += reg1;
+ reg2 += reg3;
+ reg0 += reg2;
+ tmp0 = __msa_hadd_s_d(reg0, reg0);
+ sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
+ sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
+ return sse;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
diff --git a/files/source/compare_neon.cc b/files/source/compare_neon.cc
index 49aa3b4..2a2181e 100644
--- a/files/source/compare_neon.cc
+++ b/files/source/compare_neon.cc
@@ -21,40 +21,70 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
+// 256 bits at a time.
+// Uses a short accumulator, which restricts count to 131 KB.
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n"
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
+ asm volatile(
+ "vmov.u16 q4, #0 \n" // accumulator
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n"
+ "vld1.8 {q2, q3}, [%1]! \n"
+ "veor.32 q0, q0, q2 \n"
+ "veor.32 q1, q1, q3 \n"
+ "vcnt.i8 q0, q0 \n"
+ "vcnt.i8 q1, q1 \n"
+ "subs %2, %2, #32 \n"
+ "vadd.u8 q0, q0, q1 \n" // 16 byte counts
+ "vpadal.u8 q4, q0 \n" // 8 shorts
+ "bgt 1b \n"
+
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
+ "vpadd.u32 d0, d0, d0 \n"
+ "vmov.32 %3, d0[0] \n"
+
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "q0", "q1", "q2", "q3", "q4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
diff --git a/files/source/compare_neon64.cc b/files/source/compare_neon64.cc
index f9c7df9..6e8f672 100644
--- a/files/source/compare_neon64.cc
+++ b/files/source/compare_neon64.cc
@@ -20,39 +20,65 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
+// 256 bits at a time.
+// Uses a short accumulator, which restricts count to 131 KB.
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff;
+ asm volatile(
+ "movi v4.8h, #0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %w2, %w2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "b.gt 1b \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
+ "eor v0.16b, v0.16b, v2.16b \n"
+ "eor v1.16b, v1.16b, v3.16b \n"
+ "cnt v0.16b, v0.16b \n"
+ "cnt v1.16b, v1.16b \n"
+ "subs %w2, %w2, #32 \n"
+ "add v0.16b, v0.16b, v1.16b \n"
+ "uadalp v4.8h, v0.16b \n"
+ "b.gt 1b \n"
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ "uaddlv s4, v4.8h \n"
+ "fmov %w3, s4 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v4");
+ return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
+ asm volatile(
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %w2, %w2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
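
SumSquareError accumulates (a[i] - b[i])^2 into four 32-bit lanes and folds them at the end. A scalar sketch of the quantity being computed (the helper name is illustrative, not part of the library):

  // Sum of squared byte differences, as computed by SumSquareError_NEON.
  static uint32_t SumSquareErrorSketch(const uint8_t* a, const uint8_t* b,
                                       int count) {
    uint32_t sse = 0;
    int i;
    for (i = 0; i < count; ++i) {
      int d = a[i] - b[i];
      sse += (uint32_t)(d * d);
    }
    return sse;
  }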
diff --git a/files/source/compare_win.cc b/files/source/compare_win.cc
index b17fc8e..d57d3d9 100644
--- a/files/source/compare_win.cc
+++ b/files/source/compare_win.cc
@@ -13,16 +13,35 @@
#include "libyuv/compare_row.h"
#include "libyuv/row.h"
+#if defined(_MSC_VER)
+#include <intrin.h> // For __popcnt
+#endif
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-__declspec(naked) uint32
- SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t diff = 0u;
+
+ int i;
+ for (i = 0; i < count - 3; i += 4) {
+ uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT
+ src_a += 4;
+ src_b += 4;
+ diff += __popcnt(x);
+ }
+ return diff;
+}
+
+__declspec(naked) uint32_t
+ SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
@@ -62,8 +81,8 @@
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
-__declspec(naked) uint32
- SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+__declspec(naked) uint32_t
+ SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
@@ -127,8 +146,8 @@
0x00000001, // 33 ^ 0
};
-__declspec(naked) uint32
- HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+ HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
@@ -178,8 +197,8 @@
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
-__declspec(naked) uint32
- HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+ HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
__asm {
mov eax, [esp + 4] // src
mov ecx, [esp + 8] // count
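
HammingDistance_SSE42 above is plain C; the _SSE42 suffix reflects that MSVC's __popcnt intrinsic emits the POPCNT instruction, which shipped with SSE4.2-era CPUs, so the dispatcher is expected to select this path only when the SSE4.2 CPU flag is set. Note the loop consumes 4 bytes per iteration and ignores a tail of up to 3 bytes, so callers are expected to pass counts that are multiples of 4. A rough GCC/clang counterpart (hypothetical, not in this patch) would be:

  #include <string.h>
  // Assumes -mpopcnt (or -msse4.2) so __builtin_popcount lowers to POPCNT.
  static uint32_t HammingDistancePopcntSketch(const uint8_t* a,
                                              const uint8_t* b, int count) {
    uint32_t diff = 0;
    int i;
    for (i = 0; i < count - 3; i += 4) {
      uint32_t x, y;
      memcpy(&x, a + i, 4);  // memcpy avoids unaligned pointer casts
      memcpy(&y, b + i, 4);
      diff += (uint32_t)__builtin_popcount(x ^ y);
    }
    return diff;
  }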
diff --git a/files/source/convert.cc b/files/source/convert.cc
index dfa83a5..614fa48 100644
--- a/files/source/convert.cc
+++ b/files/source/convert.cc
@@ -28,17 +28,17 @@
}
// Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y,
+static int I4xxToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int src_y_width,
int src_y_height,
@@ -62,21 +62,21 @@
return 0;
}
-// Copy I420 with optional flipping
+// Copy I420 with optional flipping.
// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
// it does row coalescing.
LIBYUV_API
-int I420Copy(const uint8* src_y,
+int I420Copy(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -106,20 +106,106 @@
return 0;
}
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
+ height);
+ // Convert UV planes.
+ Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
+ halfheight);
+ Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
+ halfheight);
+ return 0;
+}
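
The constant 16384 passed to Convert16To8Plane is a 16.16 fixed-point scale factor; assuming the usual libyuv convention dst = clamp((src * scale) >> 16), scale = 16384 is a right shift by 2, mapping the 10-bit range onto 8 bits:

  1023 * 16384 >> 16 = 255   // 10-bit max -> 8-bit max
   512 * 16384 >> 16 = 128   // mid-grey maps to mid-grey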
+
// 422 chroma is 1/2 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
-int I422ToI420(const uint8* src_y,
+int I422ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -129,20 +215,209 @@
dst_v, dst_stride_v, width, height, src_uv_width, height);
}
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+
+ I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
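
This version composes two existing primitives: I422ToI420 halves the chroma vertically into scratch planes, then MergeUVPlane interleaves them V-first for NV21. A minimal usage sketch with assumed tight strides (error handling and pixel data omitted):

  int w = 640, h = 480, hw = (w + 1) / 2, hh = (h + 1) / 2;
  uint8_t* src_y = (uint8_t*)malloc(w * h);     // filled by the caller
  uint8_t* src_u = (uint8_t*)malloc(hw * h);    // I422 chroma: full height
  uint8_t* src_v = (uint8_t*)malloc(hw * h);
  uint8_t* dst_y = (uint8_t*)malloc(w * h);
  uint8_t* dst_vu = (uint8_t*)malloc(hw * 2 * hh);  // interleaved VU
  I422ToNV21(src_y, w, src_u, hw, src_v, hw,
             dst_y, w, dst_vu, hw * 2, w, h);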
+
+#ifdef I422TONV21_ROW_VERSION
+// Unit tests fail for this version.
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+// Swap src_u and src_v to implement I422ToNV12
+LIBYUV_API
+int I422ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ InterpolateRow = InterpolateRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ InterpolateRow = InterpolateRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
+ }
+ {
+ // Allocate 2 rows of vu.
+ int awidth = halfwidth * 2;
+ align_buffer_64(row_vu_0, awidth * 2);
+ uint8_t* row_vu_1 = row_vu_0 + awidth;
+
+ for (y = 0; y < height - 1; y += 2) {
+ MergeUVRow(src_v, src_u, row_vu_0, halfwidth);
+ MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1,
+ halfwidth);
+ InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128);
+ src_u += src_stride_u * 2;
+ src_v += src_stride_v * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ MergeUVRow(src_v, src_u, dst_vu, halfwidth);
+ }
+ free_aligned_buffer_64(row_vu_0);
+ }
+ return 0;
+}
+#endif // I422TONV21_ROW_VERSION
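
A note on the disabled row version above: InterpolateRow with source_y_fraction = 128 blends two rows 50/50 (libyuv fractions run 0..256), which is how two merged VU rows would collapse into one 420 chroma row. The assumed per-pixel behavior, up to rounding details:

  // dst[i] = (src[i] * (256 - f) + src[i + stride] * f) >> 8;  with f = 128
  // this is the average of the two source rows.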
+
// 444 chroma is 1x width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
-int I444ToI420(const uint8* src_y,
+int I444ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -151,15 +426,55 @@
dst_v, dst_stride_v, width, height, width, height);
}
+// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I444ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ // Allocate u and v buffers
+ align_buffer_64(plane_u, halfwidth * halfheight * 2);
+ uint8_t* plane_v = plane_u + halfwidth * halfheight;
+
+ I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+ dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
+ height);
+ MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
+ halfwidth, halfheight);
+ free_aligned_buffer_64(plane_u);
+ return 0;
+}
+
// I400 is greyscale typically used in MJPG
LIBYUV_API
-int I400ToI420(const uint8* src_y,
+int I400ToI420(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -183,15 +498,44 @@
return 0;
}
-static void CopyPlane2(const uint8* src,
+// I400 is greyscale typically used in MJPG
+LIBYUV_API
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!dst_vu || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128);
+ return 0;
+}
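
In I400ToNV21 above, SetPlane fills the interleaved VU plane with 128, the zero-chroma value: stored U and V are biased by +128, so 128 - 128 = 0 gives Cb = Cr = 0 and the frame renders as the grayscale of the source Y plane.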
+
+static void CopyPlane2(const uint8_t* src,
int src_stride_0,
int src_stride_1,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
int y;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -212,11 +556,6 @@
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
// Copy plane
for (y = 0; y < height - 1; y += 2) {
@@ -239,16 +578,16 @@
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
// this as well as the two Y planes.
-static int X420ToI420(const uint8* src_y,
+static int X420ToI420(const uint8_t* src_y,
int src_stride_y0,
int src_stride_y1,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -303,15 +642,15 @@
// Convert NV12 to I420.
LIBYUV_API
-int NV12ToI420(const uint8* src_y,
+int NV12ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -322,15 +661,15 @@
// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
LIBYUV_API
-int NV21ToI420(const uint8* src_y,
+int NV21ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_vu,
+ const uint8_t* src_vu,
int src_stride_vu,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -341,13 +680,13 @@
// Convert M420 to I420.
LIBYUV_API
-int M420ToI420(const uint8* src_m420,
+int M420ToI420(const uint8_t* src_m420,
int src_stride_m420,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -359,20 +698,21 @@
// Convert YUY2 to I420.
LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2,
+int YUY2ToI420(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u,
- uint8* dst_v, int width) = YUY2ToUVRow_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
YUY2ToYRow_C;
// Negative height means invert the image.
if (height < 0) {
@@ -420,6 +760,18 @@
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUVRow = YUY2ToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_MMI;
+ }
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -439,20 +791,21 @@
// Convert UYVY to I420.
LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy,
+int UYVYToI420(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u,
- uint8* dst_v, int width) = UYVYToUVRow_C;
- void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
UYVYToYRow_C;
// Negative height means invert the image.
if (height < 0) {
@@ -500,6 +853,16 @@
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUVRow = UYVYToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUVRow = UYVYToUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -517,22 +880,161 @@
return 0;
}
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_uv, int width) = AYUVToUVRow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToUVRow = AYUVToUVRow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToUVRow = AYUVToUVRow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToUVRow = AYUVToUVRow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToUVRow = AYUVToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToUVRow = AYUVToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ AYUVToUVRow(src_ayuv, 0, dst_uv, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
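
The loop above is the standard 4:2:0 shape in this file: each pass emits two Y rows but only one UV row, and AYUVToUVRow receives src_stride_ayuv so it can average chroma vertically across the row pair. For an odd trailing row, a stride of 0 makes the kernel pair the row with itself:

  AYUVToUVRow(src_ayuv, 0, dst_uv, width);  // stride 0: row averaged with itself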
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+ uint8_t* dst_vu, int width) = AYUVToVURow_C;
+ void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+ AYUVToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+ src_stride_ayuv = -src_stride_ayuv;
+ }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ AYUVToVURow = AYUVToVURow_Any_SSE2;
+ AYUVToYRow = AYUVToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToVURow = AYUVToVURow_SSE2;
+ AYUVToYRow = AYUVToYRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ AYUVToVURow = AYUVToVURow_Any_AVX2;
+ AYUVToYRow = AYUVToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ AYUVToVURow = AYUVToVURow_AVX2;
+ AYUVToYRow = AYUVToYRow_AVX2;
+ }
+ }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ AYUVToYRow = AYUVToYRow_Any_NEON;
+ AYUVToVURow = AYUVToVURow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ AYUVToYRow = AYUVToYRow_NEON;
+ AYUVToVURow = AYUVToVURow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+ src_ayuv += src_stride_ayuv * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ AYUVToVURow(src_ayuv, 0, dst_vu, width);
+ AYUVToYRow(src_ayuv, dst_y, width);
+ }
+ return 0;
+}
+
// Convert ARGB to I420.
LIBYUV_API
-int ARGBToI420(const uint8* src_argb,
+int ARGBToI420(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -579,14 +1081,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -595,14 +1089,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVRow = ARGBToUVRow_Any_MSA;
@@ -611,6 +1097,22 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -630,20 +1132,21 @@
// Convert BGRA to I420.
LIBYUV_API
-int BGRAToI420(const uint8* src_bgra,
+int BGRAToI420(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, uint8* dst_u,
- uint8* dst_v, int width) = BGRAToUVRow_C;
- void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+ void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
BGRAToYRow_C;
if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -680,22 +1183,6 @@
}
}
#endif
-#if defined(HAS_BGRATOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- BGRAToYRow = BGRAToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- BGRAToYRow = BGRAToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_BGRATOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- BGRAToUVRow = BGRAToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_BGRATOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
BGRAToYRow = BGRAToYRow_Any_MSA;
@@ -712,6 +1199,22 @@
}
}
#endif
+#if defined(HAS_BGRATOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BGRAToYRow = BGRAToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BGRAToUVRow = BGRAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
@@ -731,20 +1234,21 @@
// Convert ABGR to I420.
LIBYUV_API
-int ABGRToI420(const uint8* src_abgr,
+int ABGRToI420(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, uint8* dst_u,
- uint8* dst_v, int width) = ABGRToUVRow_C;
- void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
ABGRToYRow_C;
if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -781,22 +1285,6 @@
}
}
#endif
-#if defined(HAS_ABGRTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ABGRToYRow = ABGRToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ABGRToYRow = ABGRToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_ABGRTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ABGRToUVRow = ABGRToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ABGRTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -813,6 +1301,22 @@
}
}
#endif
+#if defined(HAS_ABGRTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -832,20 +1336,21 @@
// Convert RGBA to I420.
LIBYUV_API
-int RGBAToI420(const uint8* src_rgba,
+int RGBAToI420(const uint8_t* src_rgba,
int src_stride_rgba,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, uint8* dst_u,
- uint8* dst_v, int width) = RGBAToUVRow_C;
- void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+ void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
RGBAToYRow_C;
if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -882,22 +1387,6 @@
}
}
#endif
-#if defined(HAS_RGBATOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RGBAToYRow = RGBAToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- RGBAToYRow = RGBAToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_RGBATOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RGBAToUVRow = RGBAToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_RGBATOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYRow = RGBAToYRow_Any_MSA;
@@ -914,6 +1403,22 @@
}
}
#endif
+#if defined(HAS_RGBATOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToYRow = RGBAToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGBAToUVRow = RGBAToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -933,28 +1438,31 @@
// Convert RGB24 to I420.
LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24,
+int RGB24ToI420(const uint8_t* src_rgb24,
int src_stride_rgb24,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
- void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
- void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
+ void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
RGB24ToYRow_C;
#else
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RGB24ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
#endif
if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -988,6 +1496,17 @@
RGB24ToUVRow = RGB24ToUVRow_MSA;
}
}
+#elif defined(HAS_RGB24TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
+ RGB24ToYRow = RGB24ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -1021,14 +1540,16 @@
#endif
{
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -1045,7 +1566,8 @@
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
#else
@@ -1054,7 +1576,157 @@
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
+ defined(HAS_RGB24TOYROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
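
On the fallback paths, kRowSize = (width * 4 + 31) & ~31 rounds one ARGB row up to a multiple of 32 bytes so that both rows in the 64-byte-aligned scratch buffer keep SIMD-friendly alignment. Worked example:

  width = 100: 100 * 4 = 400; 400 + 31 = 431; 431 & ~31 = 416 (= 13 * 32)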
+
+// TODO(fbarchard): Use Matrix version to implement I420 and J420.
+// Convert RGB24 to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ RGB24ToUVJRow_C;
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_NEON;
+ }
+ }
+ }
+#elif defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ RGB24ToUVJRow = RGB24ToUVJRow_MSA;
+ }
+ }
+#elif defined(HAS_RGB24TOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVJRow = RGB24ToUVJRow_MMI;
+ }
+ }
+ }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVJRow = ARGBToUVJRow_AVX2;
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+ ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYJRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVJRow(row, 0, dst_u, dst_v, width);
+ ARGBToYJRow(row, dst_y, width);
+#endif
+ }
+#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1063,28 +1735,30 @@
// Convert RAW to I420.
LIBYUV_API
-int RAWToI420(const uint8* src_raw,
+int RAWToI420(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
- void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, uint8* dst_u,
- uint8* dst_v, int width) = RAWToUVRow_C;
- void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
+ void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
RAWToYRow_C;
#else
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RAWToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
#endif
if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -1118,6 +1792,17 @@
RAWToUVRow = RAWToUVRow_MSA;
}
}
+#elif defined(HAS_RAWTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToUVRow = RAWToUVRow_Any_MMI;
+ RAWToYRow = RAWToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -1151,14 +1836,16 @@
#endif
{
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -1175,7 +1862,8 @@
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
@@ -1184,7 +1872,8 @@
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
+ defined(HAS_RAWTOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1193,29 +1882,31 @@
// Convert RGB565 to I420.
LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565,
+int RGB565ToI420(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
- void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) =
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
+ void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB565ToUVRow_C;
- void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+ void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
RGB565ToYRow_C;
#else
- void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
- RGB565ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
#endif
if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
@@ -1249,6 +1940,17 @@
RGB565ToUVRow = RGB565ToUVRow_MSA;
}
}
+#elif defined(HAS_RGB565TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_MMI;
+ RGB565ToYRow = RGB565ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from RGB565 to ARGB.
#else
#if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1287,23 +1989,17 @@
}
}
#endif
-#if defined(HAS_RGB565TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
- }
- }
-#endif
#endif
{
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1320,7 +2016,8 @@
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
RGB565ToYRow(src_rgb565, dst_y, width);
#else
@@ -1329,7 +2026,8 @@
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
+ defined(HAS_RGB565TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1338,29 +2036,31 @@
// Convert ARGB1555 to I420.
LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555,
+int ARGB1555ToI420(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
- void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) =
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
+ void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB1555ToUVRow_C;
- void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
- ARGB1555ToYRow_C;
+ void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+ int width) = ARGB1555ToYRow_C;
#else
- void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
- ARGB1555ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
#endif
if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
@@ -1395,6 +2095,17 @@
ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
}
}
+#elif defined(HAS_ARGB1555TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI;
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from ARGB1555 to ARGB.
#else
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1435,14 +2146,16 @@
#endif
#endif
{
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1461,7 +2174,8 @@
dst_v += dst_stride_v;
}
if (height & 1) {
-#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
ARGB1555ToYRow(src_argb1555, dst_y, width);
#else
@@ -1470,7 +2184,8 @@
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
+ defined(HAS_ARGB1555TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
@@ -1479,29 +2194,30 @@
// Convert ARGB4444 to I420.
LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444,
+int ARGB4444ToI420(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
-#if defined(HAS_ARGB4444TOYROW_NEON)
- void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) =
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
+ void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGB4444ToUVRow_C;
- void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
- ARGB4444ToYRow_C;
+ void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+ int width) = ARGB4444ToYRow_C;
#else
- void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
- ARGB4444ToARGBRow_C;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+ int width) = ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
#endif
if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
@@ -1527,6 +2243,17 @@
}
}
}
+#elif defined(HAS_ARGB4444TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI;
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_MMI;
+ }
+ }
+ }
// Other platforms do intermediate conversion from ARGB4444 to ARGB.
#else
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
@@ -1585,17 +2312,29 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+ }
+#endif
#endif
{
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
@@ -1614,7 +2353,7 @@
dst_v += dst_stride_v;
}
if (height & 1) {
-#if defined(HAS_ARGB4444TOYROW_NEON)
+#if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
ARGB4444ToYRow(src_argb4444, dst_y, width);
#else
@@ -1623,16 +2362,134 @@
ARGBToYRow(row, dst_y, width);
#endif
}
-#if !defined(HAS_ARGB4444TOYROW_NEON)
+#if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI))
free_aligned_buffer_64(row);
#endif
}
return 0;
}
-static void SplitPixels(const uint8* src_u,
+// Convert RGB24 to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height) {
+ int y;
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) =
+ RGB24ToYJRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
+ ARGBToYJRow_C;
+#endif
+ if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+// Neon version does direct RGB24 to YUV.
+#if defined(HAS_RGB24TOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_NEON;
+ }
+ }
+#elif defined(HAS_RGB24TOYJROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToYJRow = RGB24ToYJRow_MSA;
+ }
+ }
+#elif defined(HAS_RGB24TOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToYJRow = RGB24ToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYJRow = RGB24ToYJRow_MMI;
+ }
+ }
+// Other platforms do intermediate conversion from RGB24 to ARGB.
+#else
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#endif
+
+ {
+#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 31) & ~31;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+ RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToYJRow(row, dst_yj, width);
+ ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_yj += dst_stride_yj * 2;
+ }
+ if (height & 1) {
+#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ RGB24ToYJRow(src_rgb24, dst_yj, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToYJRow(row, dst_yj, width);
+#endif
+ }
+#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
+ defined(HAS_RGB24TOYJROW_MMI))
+ free_aligned_buffer_64(row);
+#endif
+ }
+ return 0;
+}
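// A minimal caller sketch for RGB24ToJ400 as added above, assuming a
// caller-owned packed RGB24 frame; the buffer and helper names here are
// illustrative only.
#include <stdint.h>
#include <stdlib.h>

static int ExampleRGB24ToJ400(const uint8_t* rgb24, int width, int height) {
  const int src_stride = width * 3;  // RGB24: 3 bytes per pixel
  const int dst_stride = width;      // J400: single full-range luma plane
  uint8_t* dst_yj = (uint8_t*)malloc((size_t)dst_stride * height);
  if (!dst_yj) {
    return -1;
  }
  // Passing -height instead would convert the image bottom-up.
  const int ret =
      RGB24ToJ400(rgb24, src_stride, dst_yj, dst_stride, width, height);
  free(dst_yj);
  return ret;
}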
+
+static void SplitPixels(const uint8_t* src_u,
int src_pixel_stride_uv,
- uint8* dst_u,
+ uint8_t* dst_u,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -1644,18 +2501,18 @@
// Convert Android420 to I420.
LIBYUV_API
-int Android420ToI420(const uint8* src_y,
+int Android420ToI420(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -1688,14 +2545,15 @@
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
// Split UV planes - NV21
- } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
+ }
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
halfwidth, halfheight);
return 0;
// Split UV planes - NV12
- } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
- src_stride_u == src_stride_v) {
+ }
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
halfwidth, halfheight);
return 0;
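// The branch ladder above recognizes the common Android420 chroma layouts
// from the pixel stride and the U/V pointer offset. A stand-alone sketch of
// that classification; the enum and helper names are illustrative only.
#include <stddef.h>
#include <stdint.h>

enum Android420Layout { kLayoutI420, kLayoutNV21, kLayoutNV12, kLayoutOther };

static enum Android420Layout ClassifyAndroid420(const uint8_t* src_u,
                                                int src_stride_u,
                                                const uint8_t* src_v,
                                                int src_stride_v,
                                                int src_pixel_stride_uv) {
  const ptrdiff_t vu_off = src_v - src_u;
  if (src_pixel_stride_uv == 1) {
    return kLayoutI420;  // planar chroma: plain CopyPlane per plane.
  }
  if (src_pixel_stride_uv == 2 && src_stride_u == src_stride_v) {
    if (vu_off == -1) {
      return kLayoutNV21;  // interleaved VU: SplitUVPlane from src_v.
    }
    if (vu_off == 1) {
      return kLayoutNV12;  // interleaved UV: SplitUVPlane from src_u.
    }
  }
  return kLayoutOther;  // general stride: per-pixel SplitPixels fallback.
}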
diff --git a/files/source/convert_argb.cc b/files/source/convert_argb.cc
index 983be57..5405033 100644
--- a/files/source/convert_argb.cc
+++ b/files/source/convert_argb.cc
@@ -26,9 +26,9 @@
// Copy ARGB with optional flipping
LIBYUV_API
-int ARGBCopy(const uint8* src_argb,
+int ARGBCopy(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -47,21 +47,21 @@
return 0;
}
-// Convert I422 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y,
+// Convert I420 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
@@ -97,15 +97,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
- }
-#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -129,13 +120,13 @@
// Convert I420 to ARGB.
LIBYUV_API
-int I420ToARGB(const uint8* src_y,
+int I420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -146,13 +137,13 @@
// Convert I420 to ABGR.
LIBYUV_API
-int I420ToABGR(const uint8* src_y,
+int I420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -165,13 +156,13 @@
// Convert J420 to ARGB.
LIBYUV_API
-int J420ToARGB(const uint8* src_y,
+int J420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -182,13 +173,13 @@
// Convert J420 to ABGR.
LIBYUV_API
-int J420ToABGR(const uint8* src_y,
+int J420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -201,13 +192,13 @@
// Convert H420 to ARGB.
LIBYUV_API
-int H420ToARGB(const uint8* src_y,
+int H420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -218,13 +209,13 @@
// Convert H420 to ABGR.
LIBYUV_API
-int H420ToABGR(const uint8* src_y,
+int H420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -236,20 +227,20 @@
}
// Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y,
+static int I422ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
@@ -292,15 +283,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
- }
-#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -322,13 +304,13 @@
// Convert I422 to ARGB.
LIBYUV_API
-int I422ToARGB(const uint8* src_y,
+int I422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -339,13 +321,13 @@
// Convert I422 to ABGR.
LIBYUV_API
-int I422ToABGR(const uint8* src_y,
+int I422ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -358,13 +340,13 @@
// Convert J422 to ARGB.
LIBYUV_API
-int J422ToARGB(const uint8* src_y,
+int J422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -375,13 +357,13 @@
// Convert J422 to ABGR.
LIBYUV_API
-int J422ToABGR(const uint8* src_y,
+int J422ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -394,13 +376,13 @@
// Convert H422 to ARGB.
LIBYUV_API
-int H422ToARGB(const uint8* src_y,
+int H422ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -411,13 +393,13 @@
// Convert H422 to ABGR.
LIBYUV_API
-int H422ToABGR(const uint8* src_y,
+int H422ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -428,21 +410,271 @@
width, height);
}
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y,
+// Convert 10 bit YUV to AR30 with matrix
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+static int I010ToAR30Matrix(const uint16_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint16_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToAR30Row_C;
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+#if defined(HAS_I210TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToAR30Row = I210ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToAR30Row = I210ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToAR30Row = I210ToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
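// The `if (y & 1)` advance in the loop above is the shared 4:2:0 walking
// pattern in this file: one chroma row feeds two consecutive luma rows. A
// worked restatement of the indexing, for reference:
//
//   luma row 0 -> chroma row 0
//   luma row 1 -> chroma row 0   (advance after the odd row)
//   luma row 2 -> chroma row 1
//   luma row 3 -> chroma row 1
//
// so chroma_row == luma_row / 2 holds throughout the loop.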
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+ src_stride_u, dst_ab30, dst_stride_ab30,
+ &kYvuH709Constants, width, height);
+}
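// Caller sketch for the 10-bit entry points above. Because the plane
// pointers are uint16_t*, the strides advance in uint16_t elements rather
// than bytes (the matrix loop does `src_y += src_stride_y` on a uint16_t*).
// Buffer names are illustrative.
#include <stdint.h>

static int ExampleI010ToAR30(const uint16_t* y, const uint16_t* u,
                             const uint16_t* v, uint8_t* dst_ar30,
                             int width, int height) {
  const int halfwidth = (width + 1) >> 1;
  return I010ToAR30(y, width,             // luma stride in uint16 elements
                    u, halfwidth,         // 4:2:0 chroma: half width per row
                    v, halfwidth,
                    dst_ar30, width * 4,  // AR30 packs 4 bytes per pixel
                    width, height);
}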
+
+// Convert 10 bit YUV to ARGB with matrix
+static int I010ToARGBMatrix(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+ const uint16_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I210ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I210ToARGBRow = I210ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I210TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I210ToARGBRow = I210ToARGBRow_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuI601Constants, // Use Yvu matrix
+ width, height);
+}
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_argb, dst_stride_argb,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
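// The ABGR wrappers above all lean on one substitution: pass V through the
// U parameter (and U through V) and select the mirrored kYvu*Constants, so
// the ARGB row kernels emit ABGR with R and B exchanged. Schematically:
//
//   ARGB = M_yuv(Y, U, V)
//   ABGR = M_yvu(Y, V, U)   // same kernel, swapped chroma + mirrored matrix
//
// which is why no dedicated I010ToABGR/H010ToABGR kernels are required.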
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I444ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
@@ -485,14 +717,6 @@
}
}
#endif
-#if defined(HAS_I444TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- I444ToARGBRow = I444ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_I444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444ToARGBRow = I444ToARGBRow_Any_MSA;
@@ -514,13 +738,13 @@
// Convert I444 to ARGB.
LIBYUV_API
-int I444ToARGB(const uint8* src_y,
+int I444ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -531,13 +755,13 @@
// Convert I444 to ABGR.
LIBYUV_API
-int I444ToABGR(const uint8* src_y,
+int I444ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -550,13 +774,13 @@
// Convert J444 to ARGB.
LIBYUV_API
-int J444ToARGB(const uint8* src_y,
+int J444ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -566,28 +790,28 @@
}
// Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y,
+static int I420AlphaToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- const uint8* src_a,
+ const uint8_t* src_a,
int src_stride_a,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height,
int attenuate) {
int y;
- void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, const uint8* a_buf,
- uint8* dst_argb,
+ void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) = I422AlphaToARGBRow_C;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
- ARGBAttenuateRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -621,15 +845,6 @@
}
}
#endif
-#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
- }
-#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
@@ -670,6 +885,14 @@
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -690,15 +913,15 @@
// Convert I420 with Alpha to ARGB.
LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y,
+int I420AlphaToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- const uint8* src_a,
+ const uint8_t* src_a,
int src_stride_a,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
@@ -711,15 +934,15 @@
// Convert I420 with Alpha to ABGR.
LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y,
+int I420AlphaToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- const uint8* src_a,
+ const uint8_t* src_a,
int src_stride_a,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height,
@@ -733,14 +956,14 @@
// Convert I400 to ARGB.
LIBYUV_API
-int I400ToARGB(const uint8* src_y,
+int I400ToARGB(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) =
+ void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
I400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -789,6 +1012,14 @@
}
}
#endif
+#if defined(HAS_I400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I400ToARGBRow = I400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I400ToARGBRow(src_y, dst_argb, width);
@@ -800,14 +1031,14 @@
// Convert J400 to ARGB.
LIBYUV_API
-int J400ToARGB(const uint8* src_y,
+int J400ToARGB(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
J400ToARGBRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -856,6 +1087,14 @@
}
}
#endif
+#if defined(HAS_J400TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ J400ToARGBRow = J400ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ J400ToARGBRow = J400ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
J400ToARGBRow(src_y, dst_argb, width);
src_y += src_stride_y;
@@ -865,87 +1104,87 @@
}
// Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u,
- 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
// Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u,
- 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
// Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u,
- 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
// Convert BGRA to ARGB.
LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra,
+int BGRAToARGB(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
+ (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
}
// Convert ARGB to BGRA (same as BGRAToARGB).
LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra,
+int ARGBToBGRA(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskBGRAToARGB), width, height);
+ (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
}
// Convert ABGR to ARGB.
LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr,
+int ABGRToARGB(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
+ (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
}
// Convert ARGB to ABGR (same as ABGRToARGB).
LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr,
+int ARGBToABGR(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskABGRToARGB), width, height);
+ (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
}
// Convert RGBA to ARGB.
LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba,
+int RGBAToARGB(const uint8_t* src_rgba,
int src_stride_rgba,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
- (const uint8*)(&kShuffleMaskRGBAToARGB), width, height);
+ (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
}
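// What the shuffle tables above encode, as a scalar reference: output byte
// i of each 16-byte (4-pixel) group is input byte mask[i] of that group,
// which for within-pixel masks like these matches ARGBShuffle's per-pixel
// behavior. The helper below is an illustrative sketch and assumes width is
// a multiple of 4 for brevity.
#include <stdint.h>

static void ShuffleRow_Scalar(const uint8_t* src, const uint8_t mask[16],
                              uint8_t* dst, int width) {
  int i, j;
  for (i = 0; i < width * 4; i += 16) {  // 4 ARGB pixels per group
    for (j = 0; j < 16; ++j) {
      dst[i + j] = src[i + mask[j]];
    }
  }
}
// For example, ABGRToARGB above is exactly ARGBShuffle driven by
// kShuffleMaskABGRToARGB = {2,1,0,3, ...}: B and R swap within each pixel.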
// Convert RGB24 to ARGB.
LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24,
+int RGB24ToARGB(const uint8_t* src_rgb24,
int src_stride_rgb24,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RGB24ToARGBRow_C;
if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -978,14 +1217,6 @@
}
}
#endif
-#if defined(HAS_RGB24TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RGB24ToARGBRow = RGB24ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- RGB24ToARGBRow = RGB24ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_RGB24TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
@@ -994,6 +1225,14 @@
}
}
#endif
+#if defined(HAS_RGB24TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -1005,14 +1244,14 @@
// Convert RAW to ARGB.
LIBYUV_API
-int RAWToARGB(const uint8* src_raw,
+int RAWToARGB(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+ void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RAWToARGBRow_C;
if (!src_raw || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1045,14 +1284,6 @@
}
}
#endif
-#if defined(HAS_RAWTOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RAWToARGBRow = RAWToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- RAWToARGBRow = RAWToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_RAWTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RAWToARGBRow = RAWToARGBRow_Any_MSA;
@@ -1061,6 +1292,14 @@
}
}
#endif
+#if defined(HAS_RAWTOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToARGBRow = RAWToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToARGBRow = RAWToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToARGBRow(src_raw, dst_argb, width);
@@ -1072,15 +1311,15 @@
// Convert RGB565 to ARGB.
LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565,
+int RGB565ToARGB(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
- RGB565ToARGBRow_C;
+ void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+ int width) = RGB565ToARGBRow_C;
if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1120,14 +1359,6 @@
}
}
#endif
-#if defined(HAS_RGB565TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- RGB565ToARGBRow = RGB565ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- RGB565ToARGBRow = RGB565ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_RGB565TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
@@ -1136,6 +1367,14 @@
}
}
#endif
+#if defined(HAS_RGB565TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1147,14 +1386,14 @@
// Convert ARGB1555 to ARGB.
LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555,
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
int width) = ARGB1555ToARGBRow_C;
if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1195,14 +1434,6 @@
}
}
#endif
-#if defined(HAS_ARGB1555TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 4)) {
- ARGB1555ToARGBRow = ARGB1555ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGB1555TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
@@ -1211,6 +1442,14 @@
}
}
#endif
+#if defined(HAS_ARGB1555TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1222,14 +1461,14 @@
// Convert ARGB4444 to ARGB.
LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444,
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
int width) = ARGB4444ToARGBRow_C;
if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1270,14 +1509,6 @@
}
}
#endif
-#if defined(HAS_ARGB4444TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 4)) {
- ARGB4444ToARGBRow = ARGB4444ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGB4444TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
@@ -1286,6 +1517,14 @@
}
}
#endif
+#if defined(HAS_ARGB4444TOARGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -1295,20 +1534,116 @@
return 0;
}
+// Convert AR30 to ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_argb = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToARGBRow_C(src_ar30, dst_argb, width);
+ src_ar30 += src_stride_ar30;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
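// The "Coalesce rows" test above is a recurring micro-optimization in these
// converters: when both strides equal the packed row width, the image is one
// contiguous block and can be processed as a single long row, eliminating
// per-row loop overhead. Restated as an illustrative helper:
static void CoalesceRows(int* width, int* height, int* src_stride,
                         int* dst_stride, int bytes_per_pixel) {
  if (*src_stride == *width * bytes_per_pixel &&
      *dst_stride == *width * bytes_per_pixel) {
    *width *= *height;  // one row now covers every pixel
    *height = 1;
    *src_stride = *dst_stride = 0;  // no per-row pointer advance needed
  }
}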
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_abgr,
+ int dst_stride_abgr,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_abgr = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+ src_ar30 += src_stride_ar30;
+ dst_abgr += dst_stride_abgr;
+ }
+ return 0;
+}
+
+// Convert AR30 to AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+ int src_stride_ar30,
+ uint8_t* dst_ab30,
+ int dst_stride_ab30,
+ int width,
+ int height) {
+ int y;
+ if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+ src_stride_ar30 = -src_stride_ar30;
+ }
+ // Coalesce rows.
+ if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_ar30 = dst_stride_ab30 = 0;
+ }
+ for (y = 0; y < height; ++y) {
+ AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+ src_ar30 += src_stride_ar30;
+ dst_ab30 += dst_stride_ab30;
+ }
+ return 0;
+}
+
// Convert NV12 to ARGB with matrix
-static int NV12ToARGBMatrix(const uint8* src_y,
+static int NV12ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- NV12ToARGBRow_C;
+ void (*NV12ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1342,14 +1677,6 @@
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_NV12TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
@@ -1371,20 +1698,20 @@
}
// Convert NV21 to ARGB with matrix
-static int NV21ToARGBMatrix(const uint8* src_y,
+static int NV21ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
- int src_stride_uv,
- uint8* dst_argb,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*NV21ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- NV21ToARGBRow_C;
- if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+ void (*NV21ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+ if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1427,11 +1754,11 @@
#endif
for (y = 0; y < height; ++y) {
- NV21ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
+ NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
- src_uv += src_stride_uv;
+ src_vu += src_stride_vu;
}
}
return 0;
@@ -1439,11 +1766,11 @@
// Convert NV12 to ARGB.
LIBYUV_API
-int NV12ToARGB(const uint8* src_y,
+int NV12ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -1453,15 +1780,15 @@
// Convert NV21 to ARGB.
LIBYUV_API
-int NV21ToARGB(const uint8* src_y,
+int NV21ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
- int src_stride_uv,
- uint8* dst_argb,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+ return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
dst_stride_argb, &kYuvI601Constants, width, height);
}
@@ -1469,11 +1796,11 @@
// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix.
// To swap the UV use NV12 instead of NV21.
LIBYUV_API
-int NV12ToABGR(const uint8* src_y,
+int NV12ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -1483,11 +1810,11 @@
// Convert NV21 to ABGR.
LIBYUV_API
-int NV21ToABGR(const uint8* src_y,
+int NV21ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_vu,
+ const uint8_t* src_vu,
int src_stride_vu,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
@@ -1495,18 +1822,243 @@
dst_stride_abgr, &kYvuI601Constants, width, height);
}
+// TODO(fbarchard): Consider SSSE3 2 step conversion.
+// Convert NV12 to RGB24 with matrix
+static int NV12ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV12ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+ if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB24 with matrix
+static int NV21ToRGB24Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToRGB24Row)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+ if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+ dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+ width, height);
+}
+
+// Convert NV12 to RAW.
+LIBYUV_API
+int NV12ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to RAW.
+LIBYUV_API
+int NV21ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
+ dst_stride_raw, &kYvuI601Constants, width, height);
+}
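// Note the deliberate cross-over above: RAW is RGB24 with the R and B bytes
// exchanged, so NV12ToRAW calls the NV21 RGB24 path (and vice versa) with
// the mirrored kYvuI601Constants. The swapped chroma order plus mirrored
// matrix flips R<->B in the packed output, yielding RAW from the existing
// RGB24 row kernels with no dedicated RAW kernels.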
+
+// Convert NV21 to YUV24
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height) {
+ int y;
+ void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
+ uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
+ if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+ dst_stride_yuv24 = -dst_stride_yuv24;
+ }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+ dst_yuv24 += dst_stride_yuv24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
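// Caller sketch for NV21ToYUV24 as added above: the destination packs three
// bytes per pixel, and the interleaved VU plane is shared between row pairs
// exactly as in the ARGB paths. Names and the assumed VU stride (two bytes
// per chroma sample pair) below are illustrative.
#include <stdint.h>

static int ExampleNV21ToYUV24(const uint8_t* src_y, const uint8_t* src_vu,
                              uint8_t* dst_yuv24, int width, int height) {
  const int halfwidth = (width + 1) >> 1;
  return NV21ToYUV24(src_y, width,
                     src_vu, halfwidth * 2,  // interleaved V,U pairs per row
                     dst_yuv24, width * 3,   // packed 24-bit output
                     width, height);
}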
+
// Convert M420 to ARGB.
LIBYUV_API
-int M420ToARGB(const uint8* src_m420,
+int M420ToARGB(const uint8_t* src_m420,
int src_stride_m420,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*NV12ToARGBRow)(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
- const struct YuvConstants* yuvconstants, int width) =
- NV12ToARGBRow_C;
+ void (*NV12ToARGBRow)(
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1540,14 +2092,6 @@
}
}
#endif
-#if defined(HAS_NV12TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- NV12ToARGBRow = NV12ToARGBRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_NV12TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
@@ -1574,14 +2118,14 @@
// Convert YUY2 to ARGB.
LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2,
+int YUY2ToARGB(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb,
+ void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, int width) =
YUY2ToARGBRow_C;
if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
@@ -1641,14 +2185,14 @@
// Convert UYVY to ARGB.
LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy,
+int UYVYToARGB(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb,
+ void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, int width) =
UYVYToARGBRow_C;
if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
@@ -1705,10 +2249,10 @@
}
return 0;
}
-static void WeavePixels(const uint8* src_u,
- const uint8* src_v,
+static void WeavePixels(const uint8_t* src_u,
+ const uint8_t* src_v,
int src_pixel_stride_uv,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -1722,20 +2266,20 @@
// Convert Android420 to ARGB.
LIBYUV_API
-int Android420ToARGBMatrix(const uint8* src_y,
+int Android420ToARGBMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- uint8* dst_uv;
+ uint8_t* dst_uv;
const ptrdiff_t vu_off = src_v - src_u;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
@@ -1756,13 +2300,14 @@
src_stride_v, dst_argb, dst_stride_argb,
yuvconstants, width, height);
// NV21
- } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
- src_stride_u == src_stride_v) {
+ }
+ if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+ src_stride_u == src_stride_v) {
return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
dst_stride_argb, yuvconstants, width, height);
// NV12
- } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
- src_stride_u == src_stride_v) {
+ }
+ if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
dst_stride_argb, yuvconstants, width, height);
}
@@ -1784,14 +2329,14 @@
// Convert Android420 to ARGB.
LIBYUV_API
-int Android420ToARGB(const uint8* src_y,
+int Android420ToARGB(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
@@ -1803,14 +2348,14 @@
// Convert Android420 to ABGR.
LIBYUV_API
-int Android420ToABGR(const uint8* src_y,
+int Android420ToABGR(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_pixel_stride_uv,
- uint8* dst_abgr,
+ uint8_t* dst_abgr,
int dst_stride_abgr,
int width,
int height) {
diff --git a/files/source/convert_from.cc b/files/source/convert_from.cc
index e0ebfb0..60140cb 100644
--- a/files/source/convert_from.cc
+++ b/files/source/convert_from.cc
@@ -30,17 +30,17 @@
}
// I420 to any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y,
+static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int src_y_width,
int src_y_height,
@@ -65,20 +65,64 @@
return 0;
}
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Convert Y plane.
+ Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+ height);
+ // Convert UV planes.
+ Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+ halfheight);
+ Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+ halfheight);
+ return 0;
+}
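// Why scale == 1024 above produces 10-bit samples: Convert8To16Plane scales
// each 8-bit value v to roughly (v * scale) >> 8 per sample, so 1024 acts
// as a left shift by 2 into the 10-bit range:
//
//   255 * 1024 >> 8 == 1020   // 8-bit max lands just under the 10-bit max
//   128 * 1024 >> 8 == 512    // mid grey maps to 10-bit mid grey
//
// By the same rule a 12-bit target would use scale 4096 (a shift by 4).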
+
// 420 chroma is 1/2 width, 1/2 height
// 422 chroma is 1/2 width, 1x height
LIBYUV_API
-int I420ToI422(const uint8* src_y,
+int I420ToI422(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -93,17 +137,17 @@
// 420 chroma is 1/2 width, 1/2 height
// 444 chroma is 1x width, 1x height
LIBYUV_API
-int I420ToI444(const uint8* src_y,
+int I420ToI444(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -117,9 +161,9 @@
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
-int I400Copy(const uint8* src_y,
+int I400Copy(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
@@ -137,19 +181,19 @@
}
LIBYUV_API
-int I422ToYUY2(const uint8* src_y,
+int I422ToYUY2(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_yuy2,
+ uint8_t* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) =
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
@@ -175,6 +219,14 @@
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -195,19 +247,19 @@
}
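// Scalar shape of the row kernel these converters dispatch to, for
// reference: YUY2 packs each pixel pair as Y0,U,Y1,V, so one U and one V
// are shared per pair (and, in I420ToYUY2 below, each chroma row is reused
// for two luma rows). Illustrative sketch assuming an even width:
#include <stdint.h>

static void I422ToYUY2Row_Scalar(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_yuy2,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];  // U shared by the pixel pair
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];  // V shared by the pixel pair
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_yuy2 += 4;
  }
}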
LIBYUV_API
-int I420ToYUY2(const uint8* src_y,
+int I420ToYUY2(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_yuy2,
+ uint8_t* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) =
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
return -1;
@@ -226,6 +278,14 @@
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -242,6 +302,14 @@
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -259,19 +327,19 @@
}
LIBYUV_API
-int I422ToUYVY(const uint8* src_y,
+int I422ToUYVY(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_uyvy,
+ uint8_t* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) =
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
@@ -297,6 +365,14 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -313,6 +389,14 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -325,19 +409,19 @@
}
LIBYUV_API
-int I420ToUYVY(const uint8* src_y,
+int I420ToUYVY(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_uyvy,
+ uint8_t* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) =
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
return -1;
@@ -356,6 +440,14 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -372,6 +464,14 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -390,15 +490,15 @@
// TODO(fbarchard): test negative height for invert.
LIBYUV_API
-int I420ToNV12(const uint8* src_y,
+int I420ToNV12(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
@@ -417,15 +517,15 @@
}
LIBYUV_API
-int I420ToNV21(const uint8* src_y,
+int I420ToNV21(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_vu,
+ uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
@@ -435,20 +535,20 @@
}
// Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y,
+static int I420ToRGBAMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGBARow_C;
if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
@@ -484,15 +584,6 @@
}
}
#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
- I422ToRGBARow = I422ToRGBARow_DSPR2;
- }
-#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
@@ -516,13 +607,13 @@
// Convert I420 to RGBA.
LIBYUV_API
-int I420ToRGBA(const uint8* src_y,
+int I420ToRGBA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
@@ -533,13 +624,13 @@
// Convert I420 to BGRA.
LIBYUV_API
-int I420ToBGRA(const uint8* src_y,
+int I420ToBGRA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_bgra,
+ uint8_t* dst_bgra,
int dst_stride_bgra,
int width,
int height) {
@@ -551,20 +642,20 @@
}
// Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y,
+static int I420ToRGB24Matrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb24,
+ uint8_t* dst_rgb24,
int dst_stride_rgb24,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I422ToRGB24Row)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGB24Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
@@ -579,7 +670,7 @@
#if defined(HAS_I422TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
+ if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
@@ -587,7 +678,7 @@
#if defined(HAS_I422TORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToRGB24Row = I422ToRGB24Row_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 32)) {
I422ToRGB24Row = I422ToRGB24Row_AVX2;
}
}
@@ -623,13 +714,13 @@
// Convert I420 to RGB24.
LIBYUV_API
-int I420ToRGB24(const uint8* src_y,
+int I420ToRGB24(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb24,
+ uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
@@ -640,13 +731,13 @@
// Convert I420 to RAW.
LIBYUV_API
-int I420ToRAW(const uint8* src_y,
+int I420ToRAW(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_raw,
+ uint8_t* dst_raw,
int dst_stride_raw,
int width,
int height) {
@@ -657,21 +748,57 @@
width, height);
}
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb24,
+ int dst_stride_rgb24,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb24, dst_stride_rgb24,
+ &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_raw,
+ int dst_stride_raw,
+ int width,
+ int height) {
+ return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+ src_stride_v, // Swap U and V
+ src_u, src_stride_u, dst_raw, dst_stride_raw,
+ &kYvuH709Constants, // Use Yvu matrix
+ width, height);
+}
+
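
H420ToRAW above avoids a dedicated RAW path: RAW is RGB24 with the R and B bytes exchanged, and exchanging the U and V planes while selecting the VU-ordered constant set makes the RGB24 kernel emit exactly that byte order. Restated as a sketch (equivalent to the body above, not a new entry point):

    // RAW (rgb bytes in memory) = RGB24 (bgr bytes in memory) with R and B
    // swapped. Passing V where U is expected, together with the YVU
    // constants, swaps the red and blue coefficients inside the kernel.
    return I420ToRGB24Matrix(src_y, src_stride_y, src_v, src_stride_v,
                             src_u, src_stride_u, dst_raw, dst_stride_raw,
                             &kYvuH709Constants, width, height);
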
// Convert I420 to ARGB1555.
LIBYUV_API
-int I420ToARGB1555(const uint8* src_y,
+int I420ToARGB1555(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb1555,
+ uint8_t* dst_argb1555,
int dst_stride_argb1555,
int width,
int height) {
int y;
- void (*I422ToARGB1555Row)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB1555Row_C;
if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
@@ -708,14 +835,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGB1555ROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- I422ToARGB1555Row = I422ToARGB1555Row_Any_DSPR2;
- if (IS_ALIGNED(width, 4)) {
- I422ToARGB1555Row = I422ToARGB1555Row_DSPR2;
- }
- }
-#endif
#if defined(HAS_I422TOARGB1555ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
@@ -740,19 +859,19 @@
// Convert I420 to ARGB4444.
LIBYUV_API
-int I420ToARGB4444(const uint8* src_y,
+int I420ToARGB4444(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_argb4444,
+ uint8_t* dst_argb4444,
int dst_stride_argb4444,
int width,
int height) {
int y;
- void (*I422ToARGB4444Row)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) = I422ToARGB4444Row_C;
if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
@@ -789,14 +908,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGB4444ROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- I422ToARGB4444Row = I422ToARGB4444Row_Any_DSPR2;
- if (IS_ALIGNED(width, 4)) {
- I422ToARGB4444Row = I422ToARGB4444Row_DSPR2;
- }
- }
-#endif
#if defined(HAS_I422TOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
@@ -821,20 +932,20 @@
// Convert I420 to RGB565 with specified color matrix.
LIBYUV_API
-int I420ToRGB565Matrix(const uint8* src_y,
- int src_stride_y,
- const uint8* src_u,
- int src_stride_u,
- const uint8* src_v,
- int src_stride_v,
- uint8* dst_rgb565,
- int dst_stride_rgb565,
- const struct YuvConstants* yuvconstants,
- int width,
- int height) {
+int I420ToRGB565Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_rgb565,
+ int dst_stride_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
int y;
- void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
@@ -893,94 +1004,70 @@
// Convert I420 to RGB565.
LIBYUV_API
-int I420ToRGB565(const uint8* src_y,
+int I420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
- return I420ToRGB565Matrix(src_y,
- src_stride_y,
- src_u,
- src_stride_u,
- src_v,
- src_stride_v,
- dst_rgb565,
- dst_stride_rgb565,
- &kYuvI601Constants,
- width,
- height);
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvI601Constants, width, height);
}
// Convert J420 to RGB565.
LIBYUV_API
-int J420ToRGB565(const uint8* src_y,
+int J420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
- return I420ToRGB565Matrix(src_y,
- src_stride_y,
- src_u,
- src_stride_u,
- src_v,
- src_stride_v,
- dst_rgb565,
- dst_stride_rgb565,
- &kYuvJPEGConstants,
- width,
- height);
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvJPEGConstants, width, height);
}
// Convert H420 to RGB565.
LIBYUV_API
-int H420ToRGB565(const uint8* src_y,
+int H420ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
- return I420ToRGB565Matrix(src_y,
- src_stride_y,
- src_u,
- src_stride_u,
- src_v,
- src_stride_v,
- dst_rgb565,
- dst_stride_rgb565,
- &kYuvH709Constants,
- width,
- height);
+ return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_rgb565, dst_stride_rgb565,
+ &kYuvH709Constants, width, height);
}
// Convert I422 to RGB565.
LIBYUV_API
-int I422ToRGB565(const uint8* src_y,
+int I422ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
int y;
- void (*I422ToRGB565Row)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGB565Row_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
@@ -1036,30 +1123,30 @@
}
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
+static const uint8_t kDither565_4x4[16] = {
0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
// Convert I420 to RGB565 with dithering.
LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y,
+int I420ToRGB565Dither(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
- const uint8* dither4x4,
+ const uint8_t* dither4x4,
int width,
int height) {
int y;
- void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C;
- void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) =
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -1097,14 +1184,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
- }
-#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -1151,8 +1230,8 @@
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
ARGBToRGB565DitherRow(row_argb, dst_rgb565,
- *(uint32*)(dither4x4 + ((y & 3) << 2)),
- width); // NOLINT
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
dst_rgb565 += dst_stride_rgb565;
src_y += src_stride_y;
if (y & 1) {
@@ -1165,20 +1244,111 @@
return 0;
}
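
The dither call above hands the kernel one row of the 4x4 table per scanline: (y & 3) << 2 is the byte offset of row y % 4, and the 32-bit load packs that row's four dither values into a single argument. A sketch of the row selection; the memcpy form is the alignment-safe equivalent of the cast used in the code, which assumes the table is 4-byte aligned:

    #include <stdint.h>
    #include <string.h>

    // Select the per-scanline dither word from a 16-byte 4x4 table (sketch).
    static inline uint32_t DitherWordForRow(const uint8_t* dither4x4, int y) {
      uint32_t dither4;
      memcpy(&dither4, dither4x4 + ((y & 3) << 2), 4);  // row y % 4
      return dither4;
    }
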
+// Convert I420 to AR30 with matrix
+static int I420ToAR30Matrix(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width,
+ int height) {
+ int y;
+ void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants, int width) =
+ I422ToAR30Row_C;
+
+ if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+ dst_stride_ar30 = -dst_stride_ar30;
+ }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToAR30Row = I422ToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToAR30Row = I422ToAR30Row_AVX2;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+ dst_ar30 += dst_stride_ar30;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
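
Like every converter in this file, I420ToAR30Matrix treats a negative height as a request to invert the image vertically: the destination pointer is moved to the last row and the stride negated, so the unchanged top-to-bottom row loop writes bottom-up. The idiom in isolation (generic dst and dst_stride names):

    // Negative height means invert: write rows bottom-up.
    if (height < 0) {
      height = -height;
      dst = dst + (height - 1) * dst_stride;  // start at the last row
      dst_stride = -dst_stride;               // each row step now moves up
    }
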
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+ src_stride_v, dst_ar30, dst_stride_ar30,
+ &kYvuH709Constants, width, height);
+}
+
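
AR30, introduced here, stores each pixel as one little-endian 32-bit word: 10 bits each of B (bits 0-9), G (10-19) and R (20-29) plus 2-bit alpha (30-31), with 8-bit channels widened to 10 bits by bit replication. A packing sketch matching the layout the C kernels at this revision use; treat the helper as illustrative, not as shipped API:

    // Pack 8-bit B, G, R, A into an AR30 word (sketch).
    static inline uint32_t PackAR30(uint8_t b, uint8_t g, uint8_t r,
                                    uint8_t a) {
      uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);  // replicate top bits
      uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
      uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
      return b10 | (g10 << 10) | (r10 << 20) | ((uint32_t)(a >> 6) << 30);
    }
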
// Convert I420 to specified format
LIBYUV_API
-int ConvertFromI420(const uint8* y,
+int ConvertFromI420(const uint8_t* y,
int y_stride,
- const uint8* u,
+ const uint8_t* u,
int u_stride,
- const uint8* v,
+ const uint8_t* v,
int v_stride,
- uint8* dst_sample,
+ uint8_t* dst_sample,
int dst_sample_stride,
int width,
int height,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
int r = 0;
if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
return -1;
@@ -1240,13 +1410,18 @@
dst_sample_stride ? dst_sample_stride : width * 4, width,
height);
break;
+ case FOURCC_AR30:
+ r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4, width,
+ height);
+ break;
case FOURCC_I400:
r = I400Copy(y, y_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, width,
height);
break;
case FOURCC_NV12: {
- uint8* dst_uv = dst_sample + width * height;
+ uint8_t* dst_uv = dst_sample + width * height;
r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_uv,
dst_sample_stride ? dst_sample_stride : width, width,
@@ -1254,7 +1429,7 @@
break;
}
case FOURCC_NV21: {
- uint8* dst_vu = dst_sample + width * height;
+ uint8_t* dst_vu = dst_sample + width * height;
r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
dst_sample_stride ? dst_sample_stride : width, dst_vu,
dst_sample_stride ? dst_sample_stride : width, width,
@@ -1268,8 +1443,8 @@
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
int halfstride = (dst_sample_stride + 1) / 2;
int halfheight = (height + 1) / 2;
- uint8* dst_u;
- uint8* dst_v;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
if (format == FOURCC_YV12) {
dst_v = dst_sample + dst_sample_stride * height;
dst_u = dst_v + halfstride * halfheight;
@@ -1286,8 +1461,8 @@
case FOURCC_YV16: {
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
int halfstride = (dst_sample_stride + 1) / 2;
- uint8* dst_u;
- uint8* dst_v;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
if (format == FOURCC_YV16) {
dst_v = dst_sample + dst_sample_stride * height;
dst_u = dst_v + halfstride * height;
@@ -1303,8 +1478,8 @@
case FOURCC_I444:
case FOURCC_YV24: {
dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
- uint8* dst_u;
- uint8* dst_v;
+ uint8_t* dst_u;
+ uint8_t* dst_v;
if (format == FOURCC_YV24) {
dst_v = dst_sample + dst_sample_stride * height;
dst_u = dst_v + dst_sample_stride * height;
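
The planar cases above compute the chroma pointers directly from one contiguous sample buffer: the full-size Y plane comes first, followed by two chroma planes whose stride and height round up when halved; YV12, YV16 and YV24 differ from I420, I422 and I444 only in storing V before U. The layout arithmetic for the 420 case:

    // I420/YV12 plane layout inside a single buffer (sketch).
    int stride     = dst_sample_stride ? dst_sample_stride : width;
    int halfstride = (stride + 1) / 2;  // chroma rounds up
    int halfheight = (height + 1) / 2;
    uint8_t* plane1 = dst_sample + stride * height;      // after Y
    uint8_t* plane2 = plane1 + halfstride * halfheight;  // after first chroma
    // I420: plane1 is U, plane2 is V.  YV12: plane1 is V, plane2 is U.
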
diff --git a/files/source/convert_from_argb.cc b/files/source/convert_from_argb.cc
index 88f3827..fbcd039 100644
--- a/files/source/convert_from_argb.cc
+++ b/files/source/convert_from_argb.cc
@@ -22,21 +22,21 @@
// ARGB little endian (bgra in memory) to I444
LIBYUV_API
-int ARGBToI444(const uint8* src_argb,
+int ARGBToI444(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
- void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) = ARGBToUV444Row_C;
+ void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = ARGBToUV444Row_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
}
@@ -76,6 +76,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUV444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToUV444Row = ARGBToUV444Row_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -100,14 +108,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -116,6 +116,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -130,20 +138,21 @@
// ARGB little endian (bgra in memory) to I422
LIBYUV_API
-int ARGBToI422(const uint8* src_argb,
+int ARGBToI422(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -197,22 +206,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
@@ -231,6 +224,23 @@
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
+
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width);
@@ -243,22 +253,23 @@
}
LIBYUV_API
-int ARGBToNV12(const uint8* src_argb,
+int ARGBToNV12(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
@@ -320,6 +331,22 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -344,22 +371,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow_ = MergeUVRow_Any_MSA;
@@ -368,10 +379,18 @@
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
{
// Allocate one row each of u and v.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
- uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
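
ARGBToNV12 builds the interleaved chroma plane in two stages per pair of input rows: ARGBToUVRow averages 2x2 pixel blocks into temporary planar U and V rows, then MergeUVRow_ interleaves them into dst_uv. The scratch buffer rounds the half width up to a multiple of 32 bytes so both rows stay SIMD-friendly. The loop skeleton, condensed from the function above:

    int halfwidth = (width + 1) >> 1;
    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);  // U row then V row
    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
    for (int y = 0; y < height - 1; y += 2) {
      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);  // 2x2 avg
      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);  // U0 V0 U1 V1 ...
      ARGBToYRow(src_argb, dst_y, width);
      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
      src_argb += src_stride_argb * 2;
      dst_y += dst_stride_y * 2;
      dst_uv += dst_stride_uv;
    }
    // An odd final row is handled once more outside the loop, then
    // free_aligned_buffer_64(row_u).
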
@@ -394,23 +413,24 @@
// Same as NV12 but U and V swapped.
LIBYUV_API
-int ARGBToNV21(const uint8* src_argb,
+int ARGBToNV21(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
- int dst_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
- void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_vu, int width) = MergeUVRow_C;
+ if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -471,6 +491,23 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
+
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -495,19 +532,162 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
}
}
#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+ {
+    // Allocate one row each of u and v.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
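
ARGBToNV21 above is the same pipeline; besides the dst_uv to dst_vu renames, the only functional difference is that MergeUVRow_ receives row_v first, producing NV21's VU byte order. ABGRToNV12 below keeps the NV12 ordering and simply swaps in the ABGR row kernels:

    MergeUVRow_(row_u, row_v, dst_uv, halfwidth);  // NV12: U0 V0 U1 V1 ...
    MergeUVRow_(row_v, row_u, dst_vu, halfwidth);  // NV21: V0 U0 V1 U1 ...
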
+LIBYUV_API
+int ABGRToNV12(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
+ ABGRToYRow_C;
+ void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
+ if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToUVRow = ABGRToUVRow_Any_AVX2;
+ ABGRToYRow = ABGRToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_AVX2;
+ ABGRToYRow = ABGRToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToYRow = ABGRToYRow_Any_MSA;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToYRow = ABGRToYRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ABGRToUVRow = ABGRToUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ ABGRToUVRow = ABGRToUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToYRow = ABGRToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ABGRToUVRow = ABGRToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
}
}
#endif
@@ -519,24 +699,32 @@
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
{
// Allocate one row each of u and v.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
- uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
for (y = 0; y < height - 1; y += 2) {
- ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
- ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
- src_argb += src_stride_argb * 2;
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
- ARGBToUVRow(src_argb, 0, row_u, row_v, width);
- MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
- ARGBToYRow(src_argb, dst_y, width);
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
}
free_aligned_buffer_64(row_u);
}
@@ -545,19 +733,20 @@
// Convert ARGB to YUY2.
LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb,
+int ARGBToYUY2(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yuy2,
+ uint8_t* dst_yuy2,
int dst_stride_yuy2,
int width,
int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
- void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_yuy2, int width) =
+ void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
I422ToYUY2Row_C;
if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
@@ -627,6 +816,22 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -635,6 +840,14 @@
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToYUY2Row = I422ToYUY2Row_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOYUY2ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -643,22 +856,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_I422TOYUY2ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
@@ -667,12 +864,20 @@
}
}
#endif
+#if defined(HAS_I422TOYUY2ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToYUY2Row = I422ToYUY2Row_MMI;
+ }
+ }
+#endif
{
// Allocate one row of y and half rows of u and v.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ uint8_t* row_u = row_y + ((width + 63) & ~63);
+ uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -689,19 +894,20 @@
// Convert ARGB to UYVY.
LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb,
+int ARGBToUYVY(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_uyvy,
+ uint8_t* dst_uyvy,
int dst_stride_uyvy,
int width,
int height) {
int y;
- void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, uint8* dst_u,
- uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
- void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
- const uint8* src_v, uint8* dst_uyvy, int width) =
+ void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+ const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
I422ToUYVYRow_C;
if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
@@ -771,6 +977,22 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVRow = ARGBToUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -779,6 +1001,14 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ I422ToUYVYRow = I422ToUYVYRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_I422TOUYVYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -787,22 +1017,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToUVRow = ARGBToUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_I422TOUYVYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
@@ -811,12 +1025,20 @@
}
}
#endif
+#if defined(HAS_I422TOUYVYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToUYVYRow = I422ToUYVYRow_MMI;
+ }
+ }
+#endif
{
// Allocate one row of y and half rows of u and v.
align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+ uint8_t* row_u = row_y + ((width + 63) & ~63);
+ uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
for (y = 0; y < height; ++y) {
ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -833,14 +1055,14 @@
// Convert ARGB to I400.
LIBYUV_API
-int ARGBToI400(const uint8* src_argb,
+int ARGBToI400(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+ void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYRow_C;
if (!src_argb || !dst_y || width <= 0 || height == 0) {
return -1;
@@ -880,14 +1102,6 @@
}
}
#endif
-#if defined(HAS_ARGBTOYROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ARGBToYRow = ARGBToYRow_Any_DSPR2;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -896,6 +1110,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYRow = ARGBToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYRow(src_argb, dst_y, width);
@@ -906,31 +1128,31 @@
}
// Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u,
- 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
+static const uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
// Convert ARGB to RGBA.
LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb,
+int ARGBToRGBA(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
- (const uint8*)(&kShuffleMaskARGBToRGBA), width, height);
+ (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
}
// Convert ARGB To RGB24.
LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb,
+int ARGBToRGB24(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb24,
+ uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
int y;
- void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
ARGBToRGB24Row_C;
if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
@@ -954,6 +1176,22 @@
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
@@ -970,6 +1208,14 @@
}
}
#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -981,14 +1227,14 @@
// Convert ARGB To RAW.
LIBYUV_API
-int ARGBToRAW(const uint8* src_argb,
+int ARGBToRAW(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_raw,
+ uint8_t* dst_raw,
int dst_stride_raw,
int width,
int height) {
int y;
- void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+ void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
ARGBToRAWRow_C;
if (!src_argb || !dst_raw || width <= 0 || height == 0) {
return -1;
@@ -1012,6 +1258,14 @@
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToRAWRow = ARGBToRAWRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
@@ -1028,6 +1282,14 @@
}
}
#endif
+#if defined(HAS_ARGBTORAWROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRAWRow = ARGBToRAWRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRAWRow(src_argb, dst_raw, width);
@@ -1038,22 +1300,22 @@
}
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
+static const uint8_t kDither565_4x4[16] = {
0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb,
+int ARGBToRGB565Dither(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
- const uint8* dither4x4,
+ const uint8_t* dither4x4,
int width,
int height) {
int y;
- void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
- const uint32 dither4, int width) =
+ void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ const uint32_t dither4, int width) =
ARGBToRGB565DitherRow_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -1098,11 +1360,19 @@
}
}
#endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565DitherRow(src_argb, dst_rgb565,
- *(uint32*)(dither4x4 + ((y & 3) << 2)),
- width); /* NOLINT */
+ *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+ width);
src_argb += src_stride_argb;
dst_rgb565 += dst_stride_rgb565;
}
@@ -1112,15 +1382,15 @@
// Convert ARGB To RGB565.
// TODO(fbarchard): Consider using the low-level dither function with zeros.
LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb,
+int ARGBToRGB565(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
int y;
- void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
- ARGBToRGB565Row_C;
+ void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToRGB565Row_C;
if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
}
@@ -1167,6 +1437,14 @@
}
}
#endif
+#if defined(HAS_ARGBTORGB565ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -1178,15 +1456,15 @@
// Convert ARGB To ARGB1555.
LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb,
+int ARGBToARGB1555(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb1555,
+ uint8_t* dst_argb1555,
int dst_stride_argb1555,
int width,
int height) {
int y;
- void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
- ARGBToARGB1555Row_C;
+ void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToARGB1555Row_C;
if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
return -1;
}
@@ -1233,6 +1511,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOARGB1555ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1244,15 +1530,15 @@
// Convert ARGB To ARGB4444.
LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb,
+int ARGBToARGB4444(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb4444,
+ uint8_t* dst_argb4444,
int dst_stride_argb4444,
int width,
int height) {
int y;
- void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
- ARGBToARGB4444Row_C;
+ void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+ int width) = ARGBToARGB4444Row_C;
if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
return -1;
}
@@ -1299,6 +1585,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOARGB4444ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1308,22 +1602,121 @@
return 0;
}
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+ int src_stride_abgr,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ int y;
+ void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
+ ABGRToAR30Row_C;
+ if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ // Coalesce rows.
+ if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_abgr = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ABGRToAR30Row = ABGRToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToAR30Row = ABGRToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ABGRToAR30Row(src_abgr, dst_ar30, width);
+ src_abgr += src_stride_abgr;
+ dst_ar30 += dst_stride_ar30;
+ }
+ return 0;
+}
+
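
ABGRToAR30 (and ARGBToAR30 below) also apply libyuv's row-coalescing optimization: when source and destination strides both equal width * 4, the frame is contiguous in memory, so it can be processed as one long row, removing per-row loop and dispatch overhead. The idiom with generic names:

    // Coalesce rows: a fully packed frame becomes a single long row.
    if (src_stride == width * 4 && dst_stride == width * 4) {
      width *= height;
      height = 1;
      src_stride = dst_stride = 0;  // strides are unused for one row
    }
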
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_ar30,
+ int dst_stride_ar30,
+ int width,
+ int height) {
+ int y;
+ void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+ ARGBToAR30Row_C;
+ if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_ar30 = 0;
+ }
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToAR30Row = ARGBToAR30Row_AVX2;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ ARGBToAR30Row(src_argb, dst_ar30, width);
+ src_argb += src_stride_argb;
+ dst_ar30 += dst_stride_ar30;
+ }
+ return 0;
+}
+
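
The J-prefixed converters below emit JPEG full-range YUV: luma uses the whole 0-255 range rather than BT.601's 16-235, so they dispatch to the ARGBToYJRow/ARGBToUVJRow kernels instead of ARGBToYRow/ARGBToUVRow. For orientation, the two integer luma approximations as found in libyuv's C row code at this revision (a reference sketch, not copied from this diff):

    // Limited range (I420): output in [16, 235], +16 offset folded into bias.
    static inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
    }
    // Full range (J420): output in [0, 255], no offset.
    static inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (38 * r + 75 * g + 15 * b + 64) >> 7;
    }
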
// Convert ARGB to J420 (JPEG full-range I420).
LIBYUV_API
-int ARGBToJ420(const uint8* src_argb,
+int ARGBToJ420(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1376,6 +1769,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
@@ -1384,6 +1785,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height - 1; y += 2) {
ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -1403,20 +1812,21 @@
// Convert ARGB to J422 (JPEG full-range I422).
LIBYUV_API
-int ARGBToJ422(const uint8* src_argb,
+int ARGBToJ422(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+ uint8_t* dst_u, uint8_t* dst_v, int width) =
+ ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1476,6 +1886,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
@@ -1484,6 +1902,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOUVJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
@@ -1498,14 +1924,14 @@
// Convert ARGB to J400.
LIBYUV_API
-int ARGBToJ400(const uint8* src_argb,
+int ARGBToJ400(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_yj,
+ uint8_t* dst_yj,
int dst_stride_yj,
int width,
int height) {
int y;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
ARGBToYJRow_C;
if (!src_argb || !dst_yj || width <= 0 || height == 0) {
return -1;
@@ -1553,6 +1979,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBToYJRow(src_argb, dst_yj, width);
diff --git a/files/source/convert_jpeg.cc b/files/source/convert_jpeg.cc
index 216a9f2..f440c7c 100644
--- a/files/source/convert_jpeg.cc
+++ b/files/source/convert_jpeg.cc
@@ -22,18 +22,18 @@
#ifdef HAVE_JPEG
struct I420Buffers {
- uint8* y;
+ uint8_t* y;
int y_stride;
- uint8* u;
+ uint8_t* u;
int u_stride;
- uint8* v;
+ uint8_t* v;
int v_stride;
int w;
int h;
};
static void JpegCopyI420(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
@@ -47,7 +47,7 @@
}
static void JpegI422ToI420(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
@@ -61,7 +61,7 @@
}
static void JpegI444ToI420(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
@@ -75,7 +75,7 @@
}
static void JpegI400ToI420(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
I420Buffers* dest = (I420Buffers*)(opaque);
@@ -89,9 +89,12 @@
// Query size of MJPG in pixels.
LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) {
+int MJPGSize(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ int* width,
+ int* height) {
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
if (ret) {
*width = mjpeg_decoder.GetWidth();
*height = mjpeg_decoder.GetHeight();
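
MJPGSize pairs with MJPGToI420 below: probe the encoded frame for its dimensions, allocate the I420 planes, then decode. A hedged usage sketch against the signatures in this diff, assuming the 0-for-success convention the other entry points here use, with src_mjpg and src_size_mjpg supplied by the caller:

    #include <stdlib.h>

    int width = 0, height = 0;
    if (MJPGSize(src_mjpg, src_size_mjpg, &width, &height) == 0) {
      // I420: full-size Y plane plus two half-size chroma planes.
      int halfw = (width + 1) / 2, halfh = (height + 1) / 2;
      uint8_t* dst_y = (uint8_t*)malloc((size_t)width * height +
                                        2 * (size_t)halfw * halfh);
      uint8_t* dst_u = dst_y + (size_t)width * height;
      uint8_t* dst_v = dst_u + (size_t)halfw * halfh;
      int rc = MJPGToI420(src_mjpg, src_size_mjpg, dst_y, width,
                          dst_u, halfw, dst_v, halfw,
                          width, height,   // source dimensions (must match)
                          width, height);  // destination dimensions
      // rc == 0 on success; free(dst_y) when done.
    }
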
@@ -101,36 +104,38 @@
}
// MJPG (Motion JPEG) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
LIBYUV_API
-int MJPGToI420(const uint8* sample,
- size_t sample_size,
- uint8* y,
- int y_stride,
- uint8* u,
- int u_stride,
- uint8* v,
- int v_stride,
- int w,
- int h,
- int dw,
- int dh) {
- if (sample_size == kUnknownDataSize) {
+int MJPGToI420(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret &&
- (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
- I420Buffers bufs = {y, y_stride, u, u_stride, v, v_stride, dw, dh};
+ I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, dst_width, dst_height};
// YUV420
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -140,7 +145,8 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
+ dst_height);
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
@@ -151,7 +157,8 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
+ dst_height);
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
@@ -162,18 +169,20 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
+ dst_height);
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
+ dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+ // TODO(fbarchard): Implement conversion for any other
+ // colorspace/subsample factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported

mjpeg_decoder.UnloadFrame();
return 1;
}
@@ -181,16 +190,153 @@
return ret ? 0 : 1;
}
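A hedged usage sketch of the renamed entry point, using the dst_* names this patch introduces (plane pointers and dimensions are hypothetical; the 0 / -1 / 1 return convention is the one visible in the body above):

    // Decode one MJPEG frame into pre-allocated I420 planes.
    int r = MJPGToI420(src_mjpg, src_size_mjpg,
                       dst_y, width,            // Y plane and stride
                       dst_u, (width + 1) / 2,  // U plane and stride
                       dst_v, (width + 1) / 2,  // V plane and stride
                       width, height,           // src_width, src_height
                       width, height);          // dst_width, dst_height
    // r == 0: decoded; r == -1: size unknown; r == 1: runtime failure.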
-#ifdef HAVE_JPEG
+struct NV21Buffers {
+ uint8_t* y;
+ int y_stride;
+ uint8_t* vu;
+ int vu_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+ dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToNV21(void* opaque,
+ const uint8_t* const* data,
+ const int* strides,
+ int rows) {
+ NV21Buffers* dest = (NV21Buffers*)(opaque);
+ I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
+ dest->vu_stride, dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to NV21
+LIBYUV_API
+int MJPGToNV21(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ NV21Buffers bufs = {dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, dst_width, dst_height};
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
+ dst_height);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
+ dst_height);
+ } else {
+ // Unknown colorspace.
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
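The new MJPGToNV21 mirrors MJPGToI420 but writes a single interleaved VU plane; a sketch of the buffer layout it expects (names hypothetical, Y stride assumed equal to width):

    // NV21: full-size Y plane followed by one half-height plane of
    // interleaved V/U byte pairs, so the VU stride is two bytes per
    // chroma sample pair.
    int vu_stride = ((width + 1) / 2) * 2;
    uint8_t* dst_y = nv21_buf;                    // width x height
    uint8_t* dst_vu = nv21_buf + width * height;  // vu_stride x (height+1)/2
    int r = MJPGToNV21(src_mjpg, src_size_mjpg, dst_y, width, dst_vu,
                       vu_stride, width, height, width, height);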
struct ARGBBuffers {
- uint8* argb;
+ uint8_t* argb;
int argb_stride;
int w;
int h;
};
static void JpegI420ToARGB(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
@@ -201,7 +347,7 @@
}
static void JpegI422ToARGB(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
@@ -212,7 +358,7 @@
}
static void JpegI444ToARGB(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
@@ -223,7 +369,7 @@
}
static void JpegI400ToARGB(void* opaque,
- const uint8* const* data,
+ const uint8_t* const* data,
const int* strides,
int rows) {
ARGBBuffers* dest = (ARGBBuffers*)(opaque);
@@ -233,32 +379,33 @@
}
// MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
LIBYUV_API
-int MJPGToARGB(const uint8* sample,
- size_t sample_size,
- uint8* argb,
- int argb_stride,
- int w,
- int h,
- int dw,
- int dh) {
- if (sample_size == kUnknownDataSize) {
+int MJPGToARGB(const uint8_t* src_mjpg,
+ size_t src_size_mjpg,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
+ int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height) {
+ if (src_size_mjpg == kUnknownDataSize) {
// ERROR: MJPEG frame size unknown
return -1;
}
// TODO(fbarchard): Port MJpeg to C.
MJpegDecoder mjpeg_decoder;
- LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
- if (ret &&
- (mjpeg_decoder.GetWidth() != w || mjpeg_decoder.GetHeight() != h)) {
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
+ if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+ mjpeg_decoder.GetHeight() != src_height)) {
// ERROR: MJPEG frame has unexpected dimensions
mjpeg_decoder.UnloadFrame();
return 1; // runtime failure
}
if (ret) {
- ARGBBuffers bufs = {argb, argb_stride, dw, dh};
+ ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
// YUV420
if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
mjpeg_decoder.GetNumComponents() == 3 &&
@@ -268,7 +415,8 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
+ dst_height);
// YUV422
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
@@ -279,7 +427,8 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
+ dst_height);
// YUV444
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceYCbCr &&
@@ -290,27 +439,28 @@
mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
mjpeg_decoder.GetVertSampFactor(2) == 1 &&
mjpeg_decoder.GetHorizSampFactor(2) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
+ dst_height);
// YUV400
} else if (mjpeg_decoder.GetColorSpace() ==
MJpegDecoder::kColorSpaceGrayscale &&
mjpeg_decoder.GetNumComponents() == 1 &&
mjpeg_decoder.GetVertSampFactor(0) == 1 &&
mjpeg_decoder.GetHorizSampFactor(0) == 1) {
- ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
+ dst_height);
} else {
- // TODO(fbarchard): Implement conversion for any other colorspace/sample
- // factors that occur in practice.
- // ERROR: Unable to convert MJPEG frame because format is not supported
+ // TODO(fbarchard): Implement conversion for any other
+ // colorspace/subsample factors that occur in practice.
+ // ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
}
}
return ret ? 0 : 1;
}
-#endif
-#endif
+#endif // HAVE_JPEG
#ifdef __cplusplus
} // extern "C"
diff --git a/files/source/convert_to_argb.cc b/files/source/convert_to_argb.cc
index 63a5104..bde1aa8 100644
--- a/files/source/convert_to_argb.cc
+++ b/files/source/convert_to_argb.cc
@@ -28,11 +28,19 @@
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
+
+// TODO(fbarchard): Add the following:
+// H010ToARGB
+// I010ToARGB
+// J400ToARGB
+// J422ToARGB
+// J444ToARGB
+
LIBYUV_API
-int ConvertToARGB(const uint8* sample,
+int ConvertToARGB(const uint8_t* sample,
size_t sample_size,
- uint8* crop_argb,
- int argb_stride,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
int crop_x,
int crop_y,
int src_width,
@@ -40,11 +48,11 @@
int crop_width,
int crop_height,
enum RotationMode rotation,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
+ const uint8_t* src;
+ const uint8_t* src_uv;
int abs_src_height = (src_height < 0) ? -src_height : src_height;
int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
int r = 0;
@@ -52,17 +60,17 @@
// One pass rotation is available for some formats. For the rest, convert
// to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
// and then rotate the ARGB to the final destination buffer.
- // For in-place conversion, if destination crop_argb is same as source sample,
+ // For in-place conversion, if destination dst_argb is same as source sample,
// also enable temporary buffer.
LIBYUV_BOOL need_buf =
- (rotation && format != FOURCC_ARGB) || crop_argb == sample;
- uint8* dest_argb = crop_argb;
- int dest_argb_stride = argb_stride;
- uint8* rotate_buffer = NULL;
+ (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+ uint8_t* dest_argb = dst_argb;
+ int dest_dst_stride_argb = dst_stride_argb;
+ uint8_t* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
- if (crop_argb == NULL || sample == NULL || src_width <= 0 ||
- crop_width <= 0 || src_height == 0 || crop_height == 0) {
+ if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
return -1;
}
if (src_height < 0) {
@@ -71,104 +79,117 @@
if (need_buf) {
int argb_size = crop_width * 4 * abs_crop_height;
- rotate_buffer = (uint8*)malloc(argb_size); /* NOLINT */
+ rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
- crop_argb = rotate_buffer;
- argb_stride = crop_width * 4;
+ dst_argb = rotate_buffer;
+ dst_stride_argb = crop_width * 4;
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
+ r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
crop_width, inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToARGB(src, aligned_src_width * 2, crop_argb, argb_stride,
+ r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
crop_width, inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToARGB(src, src_width * 3, crop_argb, argb_stride, crop_width,
+ r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
case FOURCC_ARGB:
if (!need_buf && !rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
- inv_crop_height);
+ r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
+ crop_width, inv_crop_height);
}
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToARGB(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_AR30:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+ inv_crop_height);
+ break;
+ case FOURCC_AB30:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
- inv_crop_height);
+ r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+ crop_width, inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
- inv_crop_height);
+ r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+ crop_width, inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToARGB(src, src_width * 2, crop_argb, argb_stride, crop_width,
- inv_crop_height);
+ r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
+ crop_width, inv_crop_height);
break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
- r = I400ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
- r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
- argb_stride, crop_width, inv_crop_height);
+ src_uv =
+ sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+ dst_stride_argb, crop_width, inv_crop_height);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ src_uv =
+ sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
// Call NV12 but with u and v parameters swapped.
- r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, crop_argb,
- argb_stride, crop_width, inv_crop_height);
+ r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+ dst_stride_argb, crop_width, inv_crop_height);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToARGB(src, src_width, crop_argb, argb_stride, crop_width,
+ r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
inv_crop_height);
break;
+
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u;
+ const uint8_t* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
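The NV12/NV21 cases above swap src_height for abs_src_height when locating the interleaved UV plane; with a bottom-up frame (negative src_height) the old expression computed an offset before the start of the buffer. Spelled out for a hypothetical 640x480 bottom-up frame:

    // src_height = -480, so abs_src_height = 480; crop_x = crop_y = 0.
    // Old: aligned_src_width * (src_height + crop_y / 2) -> 640 * -480 < 0,
    //      pointing outside the sample buffer.
    // New: aligned_src_width * (abs_src_height + crop_y / 2) -> 640 * 480,
    //      the first byte past the Y plane, where NV12 stores its UV rows.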
@@ -183,31 +204,42 @@
halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
}
r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
- crop_argb, argb_stride, crop_width, inv_crop_height);
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
+ case FOURCC_H420: {
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
case FOURCC_J420: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
- src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
- crop_argb, argb_stride, crop_width, inv_crop_height);
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
crop_x / 2;
@@ -220,14 +252,27 @@
halfwidth * (abs_src_height + crop_y) + crop_x / 2;
}
r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
- crop_argb, argb_stride, crop_width, inv_crop_height);
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_H422: {
+ int halfwidth = (src_width + 1) / 2;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u =
+ sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2;
+ const uint8_t* src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
+ break;
+ }
+
case FOURCC_I444:
case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
if (format == FOURCC_YV24) {
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -236,12 +281,12 @@
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
- crop_argb, argb_stride, crop_width, inv_crop_height);
+ dst_argb, dst_stride_argb, crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
- r = MJPGToARGB(sample, sample_size, crop_argb, argb_stride, src_width,
+ r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
abs_src_height, crop_width, inv_crop_height);
break;
#endif
@@ -251,13 +296,13 @@
if (need_buf) {
if (!r) {
- r = ARGBRotate(crop_argb, argb_stride, dest_argb, dest_argb_stride,
+ r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
crop_width, abs_crop_height, rotation);
}
free(rotate_buffer);
} else if (rotation) {
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBRotate(src, src_width * 4, crop_argb, argb_stride, crop_width,
+ r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
inv_crop_height, rotation);
}
diff --git a/files/source/convert_to_i420.cc b/files/source/convert_to_i420.cc
index a50689d..584be0a 100644
--- a/files/source/convert_to_i420.cc
+++ b/files/source/convert_to_i420.cc
@@ -25,14 +25,14 @@
// sample_size is measured in bytes and is the size of the frame.
// With MJPEG it is the compressed size of the frame.
LIBYUV_API
-int ConvertToI420(const uint8* sample,
+int ConvertToI420(const uint8_t* sample,
size_t sample_size,
- uint8* y,
- int y_stride,
- uint8* u,
- int u_stride,
- uint8* v,
- int v_stride,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
int crop_x,
int crop_y,
int src_width,
@@ -40,11 +40,11 @@
int crop_width,
int crop_height,
enum RotationMode rotation,
- uint32 fourcc) {
- uint32 format = CanonicalFourCC(fourcc);
+ uint32_t fourcc) {
+ uint32_t format = CanonicalFourCC(fourcc);
int aligned_src_width = (src_width + 1) & ~1;
- const uint8* src;
- const uint8* src_uv;
+ const uint8_t* src;
+ const uint8_t* src_uv;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
// TODO(nisse): Why allow crop_height < 0?
const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
@@ -52,177 +52,189 @@
LIBYUV_BOOL need_buf =
(rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
format != FOURCC_NV21 && format != FOURCC_YV12) ||
- y == sample;
- uint8* tmp_y = y;
- uint8* tmp_u = u;
- uint8* tmp_v = v;
- int tmp_y_stride = y_stride;
- int tmp_u_stride = u_stride;
- int tmp_v_stride = v_stride;
- uint8* rotate_buffer = NULL;
+ dst_y == sample;
+ uint8_t* tmp_y = dst_y;
+ uint8_t* tmp_u = dst_u;
+ uint8_t* tmp_v = dst_v;
+ int tmp_y_stride = dst_stride_y;
+ int tmp_u_stride = dst_stride_u;
+ int tmp_v_stride = dst_stride_v;
+ uint8_t* rotate_buffer = NULL;
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
- if (!y || !u || !v || !sample || src_width <= 0 || crop_width <= 0 ||
- src_height == 0 || crop_height == 0) {
+ if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+ crop_width <= 0 || src_height == 0 || crop_height == 0) {
return -1;
}
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
- // For in-place conversion, if destination y is same as source sample,
+ // For in-place conversion, if destination dst_y is same as source sample,
// also enable temporary buffer.
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
- rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); /* NOLINT */
+ rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
- y = rotate_buffer;
- u = y + y_size;
- v = u + uv_size;
- y_stride = crop_width;
- u_stride = v_stride = ((crop_width + 1) / 2);
+ dst_y = rotate_buffer;
+ dst_u = dst_y + y_size;
+ dst_v = dst_u + uv_size;
+ dst_stride_y = crop_width;
+ dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
}
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = YUY2ToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
- v_stride, crop_width, inv_crop_height);
+ r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_UYVY:
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
- r = UYVYToI420(src, aligned_src_width * 2, y, y_stride, u, u_stride, v,
- v_stride, crop_width, inv_crop_height);
+ r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBP:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = RGB565ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
- v_stride, crop_width, inv_crop_height);
+ r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBO:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB1555ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
- v_stride, crop_width, inv_crop_height);
+ r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_R444:
src = sample + (src_width * crop_y + crop_x) * 2;
- r = ARGB4444ToI420(src, src_width * 2, y, y_stride, u, u_stride, v,
- v_stride, crop_width, inv_crop_height);
+ r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_24BG:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RGB24ToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_RAW:
src = sample + (src_width * crop_y + crop_x) * 3;
- r = RAWToI420(src, src_width * 3, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_ARGB:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ARGBToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_BGRA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = BGRAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_ABGR:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = ABGRToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
case FOURCC_RGBA:
src = sample + (src_width * crop_y + crop_x) * 4;
- r = RGBAToI420(src, src_width * 4, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, crop_width,
+ inv_crop_height);
break;
+ // TODO(fbarchard): Add AR30 and AB30
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
- r = I400ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, crop_width, inv_crop_height);
break;
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + (src_width * src_height) +
+ src_uv = sample + (src_width * abs_src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
- r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
- y_stride, u, u_stride, v, v_stride, crop_width,
- inv_crop_height, rotation);
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+ dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, crop_width, inv_crop_height, rotation);
break;
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + (src_width * src_height) +
+ src_uv = sample + (src_width * abs_src_height) +
((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
- // Call NV12 but with u and v parameters swapped.
- r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, y,
- y_stride, v, v_stride, u, u_stride, crop_width,
- inv_crop_height, rotation);
+ // Call NV12 but with dst_u and dst_v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+ dst_stride_y, dst_v, dst_stride_v, dst_u,
+ dst_stride_u, crop_width, inv_crop_height, rotation);
break;
case FOURCC_M420:
src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
- r = M420ToI420(src, src_width, y, y_stride, u, u_stride, v, v_stride,
- crop_width, inv_crop_height);
+ r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+ dst_v, dst_stride_v, crop_width, inv_crop_height);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
- const uint8* src_y = sample + (src_width * crop_y + crop_x);
- const uint8* src_u;
- const uint8* src_v;
+ const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8_t* src_u;
+ const uint8_t* src_v;
int halfwidth = (src_width + 1) / 2;
int halfheight = (abs_src_height + 1) / 2;
if (format == FOURCC_YV12) {
- src_v = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
} else {
- src_u = sample + src_width * abs_src_height +
- (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) +
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2);
}
- r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
- y_stride, u, u_stride, v, v_stride, crop_width,
- inv_crop_height, rotation);
+ r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, crop_width, inv_crop_height, rotation);
break;
}
case FOURCC_I422:
case FOURCC_YV16: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
int halfwidth = (src_width + 1) / 2;
if (format == FOURCC_YV16) {
src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_u = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
} else {
src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
- crop_x / 2;
+ (crop_x / 2);
src_v = sample + src_width * abs_src_height +
- halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ halfwidth * (abs_src_height + crop_y) + (crop_x / 2);
}
- r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, y,
- y_stride, u, u_stride, v, v_stride, crop_width,
- inv_crop_height);
+ r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+ dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, crop_width, inv_crop_height);
break;
}
case FOURCC_I444:
case FOURCC_YV24: {
- const uint8* src_y = sample + src_width * crop_y + crop_x;
- const uint8* src_u;
- const uint8* src_v;
+ const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+ const uint8_t* src_u;
+ const uint8_t* src_v;
if (format == FOURCC_YV24) {
src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
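The reworked YV12/I420 chroma addressing above computes whole half-rows plus a half-column instead of halving a mixed row-and-column expression; the difference shows up for odd crop_y (numbers hypothetical):

    // src_width = 640 -> halfwidth = 320; crop_x = 4, crop_y = 11.
    // Old: (halfwidth * crop_y + crop_x) / 2 = (3520 + 4) / 2 = 1762,
    //      160 bytes (half a chroma row) past a row boundary.
    // New: halfwidth * (crop_y / 2) + (crop_x / 2) = 320 * 5 + 2 = 1602,
    //      rounding the row index down first, so the pointer stays on a
    //      chroma row start.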
@@ -230,15 +242,16 @@
src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
}
- r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, y,
- y_stride, u, u_stride, v, v_stride, crop_width,
- inv_crop_height);
+ r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
+ dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, crop_width, inv_crop_height);
break;
}
#ifdef HAVE_JPEG
case FOURCC_MJPG:
- r = MJPGToI420(sample, sample_size, y, y_stride, u, u_stride, v, v_stride,
- src_width, abs_src_height, crop_width, inv_crop_height);
+ r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
+ dst_stride_u, dst_v, dst_stride_v, src_width,
+ abs_src_height, crop_width, inv_crop_height);
break;
#endif
default:
@@ -247,9 +260,10 @@
if (need_buf) {
if (!r) {
- r = I420Rotate(y, y_stride, u, u_stride, v, v_stride, tmp_y, tmp_y_stride,
- tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width,
- abs_crop_height, rotation);
+ r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+ dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+ rotation);
}
free(rotate_buffer);
}
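When need_buf is set, ConvertToI420 stages the result in a packed I420 scratch buffer before rotating into the caller's planes; its layout, exactly as allocated in the need_buf block above:

    // Packed I420 scratch: [Y | U | V], chroma planes at quarter area each.
    int y_size = crop_width * abs_crop_height;
    int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
    uint8_t* buf = (uint8_t*)malloc(y_size + uv_size * 2);
    uint8_t* buf_y = buf;             // stride: crop_width
    uint8_t* buf_u = buf_y + y_size;  // stride: (crop_width + 1) / 2
    uint8_t* buf_v = buf_u + uv_size; // stride: (crop_width + 1) / 2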
diff --git a/files/source/cpu_id.cc b/files/source/cpu_id.cc
index afb5d28..48e2b61 100644
--- a/files/source/cpu_id.cc
+++ b/files/source/cpu_id.cc
@@ -19,16 +19,10 @@
#include <immintrin.h> // For _xgetbv()
#endif
-#if !defined(__native_client__)
-#include <stdlib.h> // For getenv()
-#endif
-
// For ArmCpuCaps() but unittested on all platforms
#include <stdio.h>
#include <string.h>
-#include "libyuv/basic_types.h" // For CPU_X86
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -43,16 +37,20 @@
#define SAFEBUFFERS
#endif
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
// Low level cpuid for X86.
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)) && \
!defined(__pnacl__) && !defined(__CLR_VER)
LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
#if defined(_MSC_VER)
// Visual C version uses intrinsic or inline x86 assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
- __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+ __cpuidex(cpu_info, info_eax, info_ecx);
#elif defined(_M_IX86)
__asm {
mov eax, info_eax
@@ -66,14 +64,14 @@
}
#else // Visual C but not x86
if (info_ecx == 0) {
- __cpuid((int*)(cpu_info), info_eax);
+ __cpuid(cpu_info, info_eax);
} else {
cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
}
#endif
// GCC version uses inline x86 assembly.
#else // defined(_MSC_VER)
- uint32 info_ebx, info_edx;
+ int info_ebx, info_edx;
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
@@ -94,7 +92,7 @@
}
#else // (defined(_M_IX86) || defined(_M_X64) ...
LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+void CpuId(int eax, int ecx, int* cpu_info) {
(void)eax;
(void)ecx;
cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
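With the switch from uint32 to int, CpuId matches __cpuidex directly on x86, while the stub above zeroes the outputs elsewhere so feature tests fail cleanly. A hedged sketch of querying a feature bit (the bit position is standard x86 CPUID, not from this patch):

    int info[4] = {0, 0, 0, 0};  // eax, ebx, ecx, edx
    CpuId(1, 0, info);           // leaf 1: processor feature flags
    int has_ssse3 = (info[2] & 0x00000200) != 0;  // ECX bit 9 = SSSE3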
@@ -118,9 +116,9 @@
!defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
int GetXCR0() {
- uint32 xcr0 = 0u;
+ int xcr0 = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
- xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
+ xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT
#elif defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
#endif // defined(__i386__) || defined(__x86_64__)
@@ -154,7 +152,7 @@
}
// aarch64 uses asimd for Neon.
p = strstr(cpuinfo_line, " asimd");
- if (p && (p[6] == ' ' || p[6] == '\n')) {
+ if (p) {
fclose(f);
return kCpuHasNEON;
}
@@ -164,27 +162,40 @@
return 0;
}
+// TODO(fbarchard): Consider read_msa_ir().
+// TODO(fbarchard): Add unittest.
LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
const char ase[]) {
char cpuinfo_line[512];
- int len = (int)strlen(ase);
FILE* f = fopen(cpuinfo_name, "r");
if (!f) {
// ase enabled if /proc/cpuinfo is unavailable.
if (strcmp(ase, " msa") == 0) {
return kCpuHasMSA;
}
- return kCpuHasDSPR2;
+ if (strcmp(ase, " mmi") == 0) {
+ return kCpuHasMMI;
+ }
+ return 0;
}
while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
char* p = strstr(cpuinfo_line, ase);
- if (p && (p[len] == ' ' || p[len] == '\n')) {
+ if (p) {
fclose(f);
if (strcmp(ase, " msa") == 0) {
return kCpuHasMSA;
}
- return kCpuHasDSPR2;
+ return 0;
+ }
+ } else if (memcmp(cpuinfo_line, "cpu model", 9) == 0) {
+ char* p = strstr(cpuinfo_line, "Loongson-3");
+ if (p) {
+ fclose(f);
+ if (strcmp(ase, " mmi") == 0) {
+ return kCpuHasMMI;
+ }
+ return 0;
}
}
}
@@ -192,35 +203,14 @@
return 0;
}
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = 0; // cpu_info is not initialized yet.
-
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
- const char* var = getenv(name);
- if (var) {
- if (var[0] != '0') {
- return LIBYUV_TRUE;
- }
- }
- return LIBYUV_FALSE;
-}
-#else // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
- return LIBYUV_FALSE;
-}
-#endif
-
-LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) {
+static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
- uint32 cpu_info0[4] = {0, 0, 0, 0};
- uint32 cpu_info1[4] = {0, 0, 0, 0};
- uint32 cpu_info7[4] = {0, 0, 0, 0};
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86))
+ int cpu_info0[4] = {0, 0, 0, 0};
+ int cpu_info1[4] = {0, 0, 0, 0};
+ int cpu_info7[4] = {0, 0, 0, 0};
CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
if (cpu_info0[0] >= 7) {
@@ -241,60 +231,23 @@
// Detect AVX512bw
if ((GetXCR0() & 0xe0) == 0xe0) {
- cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+ cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+ cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+ cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+ cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+ cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
}
}
-
- // Environment variable overrides for testing.
- if (TestEnv("LIBYUV_DISABLE_X86")) {
- cpu_info &= ~kCpuHasX86;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE2")) {
- cpu_info &= ~kCpuHasSSE2;
- }
- if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
- cpu_info &= ~kCpuHasSSSE3;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE41")) {
- cpu_info &= ~kCpuHasSSE41;
- }
- if (TestEnv("LIBYUV_DISABLE_SSE42")) {
- cpu_info &= ~kCpuHasSSE42;
- }
- if (TestEnv("LIBYUV_DISABLE_AVX")) {
- cpu_info &= ~kCpuHasAVX;
- }
- if (TestEnv("LIBYUV_DISABLE_AVX2")) {
- cpu_info &= ~kCpuHasAVX2;
- }
- if (TestEnv("LIBYUV_DISABLE_ERMS")) {
- cpu_info &= ~kCpuHasERMS;
- }
- if (TestEnv("LIBYUV_DISABLE_FMA3")) {
- cpu_info &= ~kCpuHasFMA3;
- }
- if (TestEnv("LIBYUV_DISABLE_AVX3")) {
- cpu_info &= ~kCpuHasAVX3;
- }
- if (TestEnv("LIBYUV_DISABLE_F16C")) {
- cpu_info &= ~kCpuHasF16C;
- }
-
#endif
#if defined(__mips__) && defined(__linux__)
-#if defined(__mips_dspr2)
- cpu_info |= kCpuHasDSPR2;
-#endif
#if defined(__mips_msa)
cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
+#elif defined(_MIPS_ARCH_LOONGSON3A)
+ cpu_info = MipsCpuCaps("/proc/cpuinfo", " mmi");
#endif
cpu_info |= kCpuHasMIPS;
- if (getenv("LIBYUV_DISABLE_DSPR2")) {
- cpu_info &= ~kCpuHasDSPR2;
- }
- if (getenv("LIBYUV_DISABLE_MSA")) {
- cpu_info &= ~kCpuHasMSA;
- }
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
@@ -313,22 +266,22 @@
cpu_info = ArmCpuCaps("/proc/cpuinfo");
#endif
cpu_info |= kCpuHasARM;
- if (TestEnv("LIBYUV_DISABLE_NEON")) {
- cpu_info &= ~kCpuHasNEON;
- }
#endif // __arm__
- if (TestEnv("LIBYUV_DISABLE_ASM")) {
- cpu_info = 0;
- }
cpu_info |= kCpuInitialized;
- cpu_info_ = cpu_info;
return cpu_info;
}
// Note that use of this function is not thread safe.
LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
- cpu_info_ = InitCpuFlags() & enable_flags;
+int MaskCpuFlags(int enable_flags) {
+ int cpu_info = GetCpuFlags() & enable_flags;
+ SetCpuFlags(cpu_info);
+ return cpu_info;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+ return MaskCpuFlags(-1);
}
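The LIBYUV_DISABLE_* environment variables removed above are replaced by this programmatic path: MaskCpuFlags now both applies and returns the masked set, so tests can disable a feature directly (a hedged sketch; TestCpuFlag is the existing query helper):

    MaskCpuFlags(-1);                // detect and enable everything
    MaskCpuFlags(~kCpuHasAVX2);      // re-detect, but mask out AVX2
    int avx2 = TestCpuFlag(kCpuHasAVX2);  // now 0 on any machine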
#ifdef __cplusplus
diff --git a/files/source/mjpeg_decoder.cc b/files/source/mjpeg_decoder.cc
index b43c008..5c5e5ea 100644
--- a/files/source/mjpeg_decoder.cc
+++ b/files/source/mjpeg_decoder.cc
@@ -25,7 +25,8 @@
#endif
#endif
-struct FILE; // For jpeglib.h.
+
+#include <stdio.h> // For jpeglib.h.
// C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
@@ -102,7 +103,7 @@
DestroyOutputBuffers();
}
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
if (!ValidateJpeg(src, src_len)) {
return LIBYUV_FALSE;
}
@@ -129,7 +130,7 @@
if (scanlines_[i]) {
delete scanlines_[i];
}
- scanlines_[i] = new uint8*[scanlines_size];
+ scanlines_[i] = new uint8_t*[scanlines_size];
scanlines_sizes_[i] = scanlines_size;
}
@@ -145,7 +146,7 @@
if (databuf_[i]) {
delete databuf_[i];
}
- databuf_[i] = new uint8[databuf_size];
+ databuf_[i] = new uint8_t[databuf_size];
databuf_strides_[i] = databuf_stride;
}
@@ -243,7 +244,7 @@
}
// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8** planes,
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
int dst_width,
int dst_height) {
if (dst_width != GetWidth() || dst_height > GetHeight()) {
@@ -427,7 +428,15 @@
}
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
- cinfo->src->next_input_byte += num_bytes;
+ jpeg_source_mgr* src = cinfo->src;
+ size_t bytes = static_cast<size_t>(num_bytes);
+ if (bytes > src->bytes_in_buffer) {
+ src->next_input_byte = nullptr;
+ src->bytes_in_buffer = 0;
+ } else {
+ src->next_input_byte += bytes;
+ src->bytes_in_buffer -= bytes;
+ }
}
void term_source(j_decompress_ptr cinfo) {
@@ -469,9 +478,9 @@
// it.
DestroyOutputBuffers();
- scanlines_ = new uint8**[num_outbufs];
+ scanlines_ = new uint8_t**[num_outbufs];
scanlines_sizes_ = new int[num_outbufs];
- databuf_ = new uint8*[num_outbufs];
+ databuf_ = new uint8_t*[num_outbufs];
databuf_strides_ = new int[num_outbufs];
for (int i = 0; i < num_outbufs; ++i) {
@@ -527,9 +536,9 @@
return LIBYUV_TRUE;
}
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
+void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
for (int i = 0; i < num_outbufs_; ++i) {
- uint8* data_i = data[i];
+ uint8_t* data_i = data[i];
for (int j = 0; j < scanlines_sizes_[i]; ++j) {
scanlines_[i][j] = data_i;
data_i += GetComponentStride(i);
@@ -552,13 +561,13 @@
if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
return kJpegYuv420;
- } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 2 && subsample_y[1] == 1 &&
- subsample_x[2] == 2 && subsample_y[2] == 1) {
+ }
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+ subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) {
return kJpegYuv422;
- } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
- subsample_x[1] == 1 && subsample_y[1] == 1 &&
- subsample_x[2] == 1 && subsample_y[2] == 1) {
+ }
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 &&
+ subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) {
return kJpegYuv444;
}
} else if (number_of_components == 1) { // Grey-scale images.
diff --git a/files/source/mjpeg_validate.cc b/files/source/mjpeg_validate.cc
index 1a17dd7..ba0a03a 100644
--- a/files/source/mjpeg_validate.cc
+++ b/files/source/mjpeg_validate.cc
@@ -18,13 +18,13 @@
#endif
// Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
- if (sample_size >= 2) {
- const uint8* end = sample + sample_size - 1;
- const uint8* it = sample;
+static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) {
+ if (src_size_mjpg >= 2) {
+ const uint8_t* end = src_mjpg + src_size_mjpg - 1;
+ const uint8_t* it = src_mjpg;
while (it < end) {
// TODO(fbarchard): scan for 0xd9 instead.
- it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+ it = (const uint8_t*)(memchr(it, 0xff, end - it));
if (it == NULL) {
break;
}
@@ -34,34 +34,35 @@
++it; // Skip over current 0xff.
}
}
- // ERROR: Invalid jpeg end code not found. Size sample_size
+ // ERROR: Invalid jpeg end code not found. Size src_size_mjpg
return LIBYUV_FALSE;
}
// Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
// Maximum size that ValidateJpeg will consider valid.
const size_t kMaxJpegSize = 0x7fffffffull;
const size_t kBackSearchSize = 1024;
- if (sample_size < 64 || sample_size > kMaxJpegSize || !sample) {
- // ERROR: Invalid jpeg size: sample_size
+ if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) {
+ // ERROR: Invalid jpeg size: src_size_mjpg
return LIBYUV_FALSE;
}
- if (sample[0] != 0xff || sample[1] != 0xd8) { // SOI marker
+ // SOI marker
+ if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
// Look for the End Of Image (EOI) marker near the end of the buffer.
- if (sample_size > kBackSearchSize) {
- if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ if (src_size_mjpg > kBackSearchSize) {
+ if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) {
return LIBYUV_TRUE; // Success: Valid jpeg.
}
// Reduce search size for forward search.
- sample_size = sample_size - kBackSearchSize + 1;
+ src_size_mjpg = src_size_mjpg - kBackSearchSize + 1;
}
// Step over SOI marker and scan for EOI.
- return ScanEOI(sample + 2, sample_size - 2);
+ return ScanEOI(src_mjpg + 2, src_size_mjpg - 2);
}
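ValidateJpeg now also requires a third 0xff byte after the SOI marker, so a buffer must begin ff d8 ff (e.g. JFIF's ff d8 ff e0) before the EOI back-search runs. A sketch of using it as a cheap pre-check, mirroring what LoadFrame does above:

    if (!ValidateJpeg(src_mjpg, src_size_mjpg)) {
      return 1;  // reject before handing the buffer to libjpeg
    }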
#ifdef __cplusplus
diff --git a/files/source/planar_functions.cc b/files/source/planar_functions.cc
index b8a53e8..9cab230 100644
--- a/files/source/planar_functions.cc
+++ b/files/source/planar_functions.cc
@@ -26,14 +26,14 @@
// Copy a plane of data
LIBYUV_API
-void CopyPlane(const uint8* src_y,
+void CopyPlane(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -50,6 +50,7 @@
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
+
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -70,11 +71,6 @@
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
// Copy plane
for (y = 0; y < height; ++y) {
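CopyPlane and the planar helpers below share two fast-path idioms worth spelling out: negative height flips the output by starting at the last row with a negated stride, and when both strides equal width the whole plane collapses into one long row. The pattern, as it appears throughout this file:

    // Invert: write bottom-up so callers can request a vertical flip.
    if (height < 0) {
      height = -height;
      dst = dst + (height - 1) * dst_stride;
      dst_stride = -dst_stride;
    }
    // Coalesce rows: a contiguous plane becomes a single width*height row.
    if (src_stride == width && dst_stride == width) {
      width *= height;
      height = 1;
      src_stride = dst_stride = 0;
    }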
@@ -87,14 +83,14 @@
// TODO(fbarchard): Consider support for negative height.
// TODO(fbarchard): Consider stride measured in bytes.
LIBYUV_API
-void CopyPlane_16(const uint16* src_y,
+void CopyPlane_16(const uint16_t* src_y,
int src_stride_y,
- uint16* dst_y,
+ uint16_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+ void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
@@ -116,11 +112,6 @@
CopyRow = CopyRow_16_NEON;
}
#endif
-#if defined(HAS_COPYROW_16_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_16_MIPS;
- }
-#endif
// Copy plane
for (y = 0; y < height; ++y) {
@@ -130,19 +121,119 @@
}
}
+// Convert a plane of 16 bit data to 8 bit
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+ int src_stride_y,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ int scale, // 16384 for 10 bits
+ int width,
+ int height) {
+ int y;
+ void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+ int width) = Convert16To8Row_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ Convert16To8Row = Convert16To8Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ Convert16To8Row = Convert16To8Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_CONVERT16TO8ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Convert16To8Row = Convert16To8Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ Convert16To8Row = Convert16To8Row_AVX2;
+ }
+ }
+#endif
+
+ // Convert plane
+ for (y = 0; y < height; ++y) {
+ Convert16To8Row(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert a plane of 8 bit data to 16 bit
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+ int src_stride_y,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ int scale, // 16384 for 10 bits
+ int width,
+ int height) {
+ int y;
+ void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
+ int width) = Convert8To16Row_C;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width && dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+#if defined(HAS_CONVERT8TO16ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ Convert8To16Row = Convert8To16Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ Convert8To16Row = Convert8To16Row_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_CONVERT8TO16ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ Convert8To16Row = Convert8To16Row_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ Convert8To16Row = Convert8To16Row_AVX2;
+ }
+ }
+#endif
+
+ // Convert plane
+ for (y = 0; y < height; ++y) {
+ Convert8To16Row(src_y, dst_y, scale, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
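Per the inline comment, scale acts as a fixed-point factor over a 16-bit fraction: 16384 / 65536 = 1/4 narrows 10-bit samples (0..1023) onto 8 bits. A hedged call sketch; the shift-by-16 reading of the row kernels is my assumption, since they are not part of this hunk:

    // Narrow a hypothetical 640x480 10-bit plane to 8 bits; strides are in
    // samples and equal the width here.
    Convert16To8Plane(src_y10, 640,  // 10-bit source (uint16_t samples)
                      dst_y8, 640,   // 8-bit destination
                      16384,         // scale: 16384 for 10-bit input
                      640, 480);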
// Copy I422.
LIBYUV_API
-int I422Copy(const uint8* src_y,
+int I422Copy(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -171,17 +262,17 @@
// Copy I444.
LIBYUV_API
-int I444Copy(const uint8* src_y,
+int I444Copy(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -209,9 +300,9 @@
// Copy I400.
LIBYUV_API
-int I400ToI400(const uint8* src_y,
+int I400ToI400(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
@@ -230,13 +321,13 @@
// Convert I420 to I400.
LIBYUV_API
-int I420ToI400(const uint8* src_y,
+int I420ToI400(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
@@ -261,16 +352,16 @@
// Support function for NV12 etc UV channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API
-void SplitUVPlane(const uint8* src_uv,
+void SplitUVPlane(const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
// Negative height means invert the image.
if (height < 0) {
@@ -311,13 +402,19 @@
}
}
#endif
-#if defined(HAS_SPLITUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_u, 4) &&
- IS_ALIGNED(dst_stride_u, 4) && IS_ALIGNED(dst_v, 4) &&
- IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_Any_DSPR2;
- if (IS_ALIGNED(width, 16)) {
- SplitUVRow = SplitUVRow_DSPR2;
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
}
}
#endif
@@ -332,18 +429,17 @@
}
LIBYUV_API
-void MergeUVPlane(const uint8* src_u,
+void MergeUVPlane(const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
- void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
- int width) = MergeUVRow_C;
- // Coalesce rows.
+ void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+ uint8_t* dst_uv, int width) = MergeUVRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -389,6 +485,14 @@
}
}
#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeUVRow = MergeUVRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
@@ -399,15 +503,204 @@
}
}
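SplitUVPlane and MergeUVPlane gain MSA and MMI row kernels above; for orientation, a hedged sketch of the classic use, de-interleaving NV12 chroma into I420 chroma (pointers hypothetical; width and height passed are plane sizes, per the comment above):

    int halfwidth = (width + 1) / 2;
    int halfheight = (height + 1) / 2;
    // NV12's UVUV... plane becomes separate U and V planes.
    SplitUVPlane(src_uv, halfwidth * 2, dst_u, halfwidth, dst_v, halfwidth,
                 halfwidth, halfheight);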
+// Convert NV21 to NV12.
+LIBYUV_API
+int NV21ToNV12(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height) {
+ int y;
+ void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ UVToVURow_C;
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_vu || !dst_uv || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_vu = src_vu + (halfheight - 1) * src_stride_vu;
+ src_stride_y = -src_stride_y;
+ src_stride_vu = -src_stride_vu;
+ }
+ // Coalesce rows.
+ if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_vu = dst_stride_uv = 0;
+ }
+
+#if defined(HAS_UVToVUROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ UVToVURow = UVToVURow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ UVToVURow = UVToVURow_NEON;
+ }
+ }
+#endif
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ for (y = 0; y < halfheight; ++y) {
+ UVToVURow(src_vu, dst_uv, halfwidth);
+ src_vu += src_stride_vu;
+ dst_uv += dst_stride_uv;
+ }
+ return 0;
+}
+
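
NV21ToNV12 above is new in this drop: it copies Y through CopyPlane when dst_y is non-null and byte-swaps each VU pair into UV via UVToVURow. A minimal caller sketch, assuming even dimensions and tightly packed planes so every stride equals the width (the include path is an assumption):

```cpp
#include <cstdint>
#include "libyuv/planar_functions.h"  // assumed to declare NV21ToNV12

// Convert a tightly packed NV21 frame (Y plane, then interleaved VU) to
// NV12. Even dimensions assumed so every stride equals |width|.
int ConvertNv21ToNv12(const uint8_t* nv21, uint8_t* nv12, int width,
                      int height) {
  const uint8_t* src_y = nv21;
  const uint8_t* src_vu = nv21 + width * height;
  uint8_t* dst_y = nv12;
  uint8_t* dst_uv = nv12 + width * height;
  return libyuv::NV21ToNV12(src_y, width, src_vu, width, dst_y, width,
                            dst_uv, width, width, height);
}
```
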
+// Support function for splitting interleaved RGB into planar R, G and B.
+// Width and height are the sizes of the RGB plane in pixels.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+ int src_stride_rgb,
+ uint8_t* dst_r,
+ int dst_stride_r,
+ uint8_t* dst_g,
+ int dst_stride_g,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width,
+ int height) {
+ int y;
+ void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+ uint8_t* dst_b, int width) = SplitRGBRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_r = dst_r + (height - 1) * dst_stride_r;
+ dst_g = dst_g + (height - 1) * dst_stride_g;
+ dst_b = dst_b + (height - 1) * dst_stride_b;
+ dst_stride_r = -dst_stride_r;
+ dst_stride_g = -dst_stride_g;
+ dst_stride_b = -dst_stride_b;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb == width * 3 && dst_stride_r == width &&
+ dst_stride_g == width && dst_stride_b == width) {
+ width *= height;
+ height = 1;
+ src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+ }
+#if defined(HAS_SPLITRGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SplitRGBRow = SplitRGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SplitRGBRow = SplitRGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SplitRGBRow = SplitRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SplitRGBRow = SplitRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITRGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitRGBRow = SplitRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ SplitRGBRow = SplitRGBRow_MMI;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+    // Split a row of interleaved RGB into rows of R, G and B.
+ SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
+ dst_r += dst_stride_r;
+ dst_g += dst_stride_g;
+ dst_b += dst_stride_b;
+ src_rgb += src_stride_rgb;
+ }
+}
+
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+ int src_stride_r,
+ const uint8_t* src_g,
+ int src_stride_g,
+ const uint8_t* src_b,
+ int src_stride_b,
+ uint8_t* dst_rgb,
+ int dst_stride_rgb,
+ int width,
+ int height) {
+ int y;
+ void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+ const uint8_t* src_b, uint8_t* dst_rgb, int width) =
+ MergeRGBRow_C;
+ // Coalesce rows.
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+ dst_stride_rgb = -dst_stride_rgb;
+ }
+ // Coalesce rows.
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+ dst_stride_rgb == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+ }
+#if defined(HAS_MERGERGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MergeRGBRow = MergeRGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MergeRGBRow = MergeRGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeRGBRow = MergeRGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MergeRGBRow = MergeRGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGERGBROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeRGBRow = MergeRGBRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MergeRGBRow = MergeRGBRow_MMI;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+    // Merge a row of R, G and B into a row of RGB.
+ MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+ src_r += src_stride_r;
+ src_g += src_stride_g;
+ src_b += src_stride_b;
+ dst_rgb += dst_stride_rgb;
+ }
+}
+
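
SplitRGBPlane and MergeRGBPlane are inverses: one deinterleaves packed 24-bit RGB into three planes, the other reassembles it. A round-trip sketch with packed strides, assuming both are declared in planar_functions.h:

```cpp
#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // assumed to declare both functions

// Deinterleave packed RGB24 into planes, then reassemble; |out| should
// match |in| byte for byte. Strides are packed (no row padding).
void RoundTripRgb24(const uint8_t* in, uint8_t* out, int width, int height) {
  std::vector<uint8_t> r(width * height), g(width * height), b(width * height);
  libyuv::SplitRGBPlane(in, width * 3, r.data(), width, g.data(), width,
                        b.data(), width, width, height);
  libyuv::MergeRGBPlane(r.data(), width, g.data(), width, b.data(), width,
                        out, width * 3, width, height);
}
```
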
// Mirror a plane of data.
-void MirrorPlane(const uint8* src_y,
+void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@@ -438,14 +731,6 @@
}
}
#endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_y, 4) &&
- IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(dst_y, 4) &&
- IS_ALIGNED(dst_stride_y, 4)) {
- MirrorRow = MirrorRow_DSPR2;
- }
-#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
@@ -454,6 +739,14 @@
}
}
#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -465,20 +758,20 @@
// Convert YUY2 to I422.
LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2,
+int YUY2ToI422(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int width) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
YUY2ToYRow_C;
if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -537,6 +830,16 @@
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ YUY2ToUV422Row = YUY2ToUV422Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
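
YUY2 packs two pixels into four bytes as Y0 U Y1 V, so a row carries one luma sample per pixel but only one U/V pair per pixel pair. A scalar sketch of the split that YUY2ToYRow and YUY2ToUV422Row perform together (illustrative, not the actual C kernels):

```cpp
#include <cstdint>

// Split one YUY2 row (bytes Y0 U Y1 V per two pixels) into Y, U and V
// rows. |width| is the pixel width, assumed even here for brevity.
static void SplitYuy2Row(const uint8_t* yuy2, uint8_t* y, uint8_t* u,
                         uint8_t* v, int width) {
  for (int x = 0; x < width; x += 2) {
    y[x] = yuy2[0];      // Y0
    u[x / 2] = yuy2[1];  // U, shared by both pixels
    y[x + 1] = yuy2[2];  // Y1
    v[x / 2] = yuy2[3];  // V, shared by both pixels
    yuy2 += 4;
  }
}
```
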
@@ -551,20 +854,20 @@
// Convert UYVY to I422.
LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy,
+int UYVYToI422(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
- void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int width) = UYVYToUV422Row_C;
- void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int width) =
+ void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u,
+ uint8_t* dst_v, int width) = UYVYToUV422Row_C;
+ void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
UYVYToYRow_C;
if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -623,6 +926,16 @@
}
}
#endif
+#if defined(HAS_UYVYTOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ UYVYToYRow = UYVYToYRow_Any_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_Any_MMI;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_MMI;
+ UYVYToUV422Row = UYVYToUV422Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -637,14 +950,14 @@
// Convert YUY2 to Y.
LIBYUV_API
-int YUY2ToY(const uint8* src_yuy2,
+int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+ void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
YUY2ToYRow_C;
if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
return -1;
@@ -693,6 +1006,14 @@
}
}
#endif
+#if defined(HAS_YUY2TOYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ YUY2ToYRow = YUY2ToYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToYRow = YUY2ToYRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
YUY2ToYRow(src_yuy2, dst_y, width);
@@ -704,9 +1025,9 @@
// Mirror I400 with optional flipping
LIBYUV_API
-int I400Mirror(const uint8* src_y,
+int I400Mirror(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
@@ -726,17 +1047,17 @@
// Mirror I420 with optional flipping
LIBYUV_API
-int I420Mirror(const uint8* src_y,
+int I420Mirror(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
@@ -768,14 +1089,14 @@
// ARGB mirror.
LIBYUV_API
-int ARGBMirror(const uint8* src_argb,
+int ARGBMirror(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
ARGBMirrorRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -818,6 +1139,14 @@
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
// Mirror plane
for (y = 0; y < height; ++y) {
@@ -833,8 +1162,8 @@
// the same blend function for all pixels if possible.
LIBYUV_API
ARGBBlendRow GetARGBBlend() {
- void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width) = ARGBBlendRow_C;
+ void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+ uint8_t* dst_argb, int width) = ARGBBlendRow_C;
#if defined(HAS_ARGBBLENDROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBBlendRow = ARGBBlendRow_SSSE3;
@@ -846,22 +1175,32 @@
ARGBBlendRow = ARGBBlendRow_NEON;
}
#endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBBlendRow = ARGBBlendRow_MSA;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBBlendRow = ARGBBlendRow_MMI;
+ }
+#endif
return ARGBBlendRow;
}
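
GetARGBBlend picks one row kernel (now also considering MSA and MMI) so an entire image is blended with a single, consistent kernel. A sketch of the usual calling pattern, fetching the kernel once and applying it per row; the foreground is expected premultiplied:

```cpp
#include <cstdint>
#include "libyuv/planar_functions.h"  // declares ARGBBlendRow / GetARGBBlend

// Blend a premultiplied ARGB foreground over a background, row by row,
// using one row kernel chosen once for the whole image.
void BlendImage(const uint8_t* fg, int fg_stride, const uint8_t* bg,
                int bg_stride, uint8_t* dst, int dst_stride, int width,
                int height) {
  libyuv::ARGBBlendRow blend = libyuv::GetARGBBlend();
  for (int y = 0; y < height; ++y) {
    blend(fg + y * fg_stride, bg + y * bg_stride, dst + y * dst_stride,
          width);
  }
}
```
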
// Alpha Blend 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBBlend(const uint8* src_argb0,
+int ARGBBlend(const uint8_t* src_argb0,
int src_stride_argb0,
- const uint8* src_argb1,
+ const uint8_t* src_argb1,
int src_stride_argb1,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
- uint8* dst_argb, int width) = GetARGBBlend();
+ void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+ uint8_t* dst_argb, int width) = GetARGBBlend();
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -890,19 +1229,19 @@
// Alpha Blend plane and store to destination.
LIBYUV_API
-int BlendPlane(const uint8* src_y0,
+int BlendPlane(const uint8_t* src_y0,
int src_stride_y0,
- const uint8* src_y1,
+ const uint8_t* src_y1,
int src_stride_y1,
- const uint8* alpha,
+ const uint8_t* alpha,
int alpha_stride,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
- void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) =
+ void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+ const uint8_t* alpha, uint8_t* dst, int width) =
BlendPlaneRow_C;
if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
return -1;
@@ -938,6 +1277,14 @@
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
@@ -952,36 +1299,36 @@
#define MAXTWIDTH 2048
// Alpha Blend YUV images and store to destination.
LIBYUV_API
-int I420Blend(const uint8* src_y0,
+int I420Blend(const uint8_t* src_y0,
int src_stride_y0,
- const uint8* src_u0,
+ const uint8_t* src_u0,
int src_stride_u0,
- const uint8* src_v0,
+ const uint8_t* src_v0,
int src_stride_v0,
- const uint8* src_y1,
+ const uint8_t* src_y1,
int src_stride_y1,
- const uint8* src_u1,
+ const uint8_t* src_u1,
int src_stride_u1,
- const uint8* src_v1,
+ const uint8_t* src_v1,
int src_stride_v1,
- const uint8* alpha,
+ const uint8_t* alpha,
int alpha_stride,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
// Half width/height for UV.
int halfwidth = (width + 1) >> 1;
- void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
- const uint8* alpha, uint8* dst, int width) =
+ void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+ const uint8_t* alpha, uint8_t* dst, int width) =
BlendPlaneRow_C;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+ void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
!alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
return -1;
@@ -1014,6 +1361,14 @@
}
}
#endif
+#if defined(HAS_BLENDPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ BlendPlaneRow = BlendPlaneRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ BlendPlaneRow = BlendPlaneRow_MMI;
+ }
+ }
+#endif
if (!IS_ALIGNED(width, 2)) {
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
}
@@ -1050,6 +1405,17 @@
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ScaleRowDown2 = ScaleRowDown2Box_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ ScaleRowDown2 = ScaleRowDown2Box_MMI;
+ }
+ }
+ }
+#endif
// Row buffer for intermediate alpha pixels.
align_buffer_64(halfalpha, halfwidth);
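
Before blending the half-resolution U and V planes, I420Blend shrinks the full-resolution alpha plane with ScaleRowDown2Box, a rounding 2x2 box average. A scalar sketch of that filter (the real kernel takes a source stride instead of two row pointers and also handles odd widths):

```cpp
#include <cstdint>

// Average a 2x2 block per output byte, with rounding. This is the
// effect ScaleRowDown2Box has on the full-resolution alpha plane.
static void BoxDown2Row(const uint8_t* row0, const uint8_t* row1,
                        uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = static_cast<uint8_t>(
        (row0[2 * x] + row0[2 * x + 1] + row1[2 * x] + row1[2 * x + 1] + 2) >>
        2);
  }
}
```
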
@@ -1076,17 +1442,17 @@
// Multiply 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0,
+int ARGBMultiply(const uint8_t* src_argb0,
int src_stride_argb0,
- const uint8* src_argb1,
+ const uint8_t* src_argb1,
int src_stride_argb1,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
- int width) = ARGBMultiplyRow_C;
+ void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1,
+ uint8_t* dst, int width) = ARGBMultiplyRow_C;
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1135,6 +1501,14 @@
}
}
#endif
+#if defined(HAS_ARGBMULTIPLYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_MMI;
+ }
+ }
+#endif
// Multiply plane
for (y = 0; y < height; ++y) {
@@ -1148,16 +1522,16 @@
// Add 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBAdd(const uint8* src_argb0,
+int ARGBAdd(const uint8_t* src_argb0,
int src_stride_argb0,
- const uint8* src_argb1,
+ const uint8_t* src_argb1,
int src_stride_argb1,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
int width) = ARGBAddRow_C;
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1212,6 +1586,14 @@
}
}
#endif
+#if defined(HAS_ARGBADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAddRow = ARGBAddRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAddRow = ARGBAddRow_MMI;
+ }
+ }
+#endif
// Add plane
for (y = 0; y < height; ++y) {
@@ -1225,17 +1607,17 @@
// Subtract 2 ARGB images and store to destination.
LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0,
+int ARGBSubtract(const uint8_t* src_argb0,
int src_stride_argb0,
- const uint8* src_argb1,
+ const uint8_t* src_argb1,
int src_stride_argb1,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
- int width) = ARGBSubtractRow_C;
+ void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
+ uint8_t* dst, int width) = ARGBSubtractRow_C;
if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1284,6 +1666,14 @@
}
}
#endif
+#if defined(HAS_ARGBSUBTRACTROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBSubtractRow = ARGBSubtractRow_MMI;
+ }
+ }
+#endif
// Subtract plane
for (y = 0; y < height; ++y) {
@@ -1295,20 +1685,20 @@
return 0;
}
// Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y,
+static int I422ToRGBAMatrix(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
const struct YuvConstants* yuvconstants,
int width,
int height) {
int y;
- void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf,
+ void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) =
I422ToRGBARow_C;
if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
@@ -1344,15 +1734,6 @@
}
}
#endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
- I422ToRGBARow = I422ToRGBARow_DSPR2;
- }
-#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
@@ -1374,13 +1755,13 @@
// Convert I422 to RGBA.
LIBYUV_API
-int I422ToRGBA(const uint8* src_y,
+int I422ToRGBA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_rgba,
+ uint8_t* dst_rgba,
int dst_stride_rgba,
int width,
int height) {
@@ -1391,13 +1772,13 @@
// Convert I422 to BGRA.
LIBYUV_API
-int I422ToBGRA(const uint8* src_y,
+int I422ToBGRA(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_bgra,
+ uint8_t* dst_bgra,
int dst_stride_bgra,
int width,
int height) {
@@ -1410,17 +1791,17 @@
// Convert NV12 to RGB565.
LIBYUV_API
-int NV12ToRGB565(const uint8* src_y,
+int NV12ToRGB565(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_rgb565,
+ uint8_t* dst_rgb565,
int dst_stride_rgb565,
int width,
int height) {
int y;
void (*NV12ToRGB565Row)(
- const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf,
+ const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
return -1;
@@ -1477,14 +1858,14 @@
// Convert RAW to RGB24.
LIBYUV_API
-int RAWToRGB24(const uint8* src_raw,
+int RAWToRGB24(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_rgb24,
+ uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
int y;
- void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+ void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
RAWToRGB24Row_C;
if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
return -1;
@@ -1525,6 +1906,14 @@
}
}
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ RAWToRGB24Row = RAWToRGB24Row_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ RAWToRGB24Row = RAWToRGB24Row_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1535,13 +1924,13 @@
}
LIBYUV_API
-void SetPlane(uint8* dst_y,
+void SetPlane(uint8_t* dst_y,
int dst_stride_y,
int width,
int height,
- uint32 value) {
+ uint32_t value) {
int y;
- void (*SetRow)(uint8 * dst, uint8 value, int width) = SetRow_C;
+ void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1574,6 +1963,11 @@
SetRow = SetRow_ERMS;
}
#endif
+#if defined(HAS_SETROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_MSA;
+ }
+#endif
// Set plane
for (y = 0; y < height; ++y) {
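
SetPlane, which gains an MSA path here, fills a plane with one byte value; I420Rect below composes three such fills. Clearing a luma plane to video black, for instance (include path assumed):

```cpp
#include <cstdint>
#include "libyuv/planar_functions.h"  // assumed to declare SetPlane

// Fill a Y plane with 16, the nominal black level for video-range YUV.
void ClearLumaToBlack(uint8_t* y, int y_stride, int width, int height) {
  libyuv::SetPlane(y, y_stride, width, height, 16u);
}
```
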
@@ -1584,11 +1978,11 @@
// Draw a rectangle into I420
LIBYUV_API
-int I420Rect(uint8* dst_y,
+int I420Rect(uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int x,
int y,
@@ -1599,9 +1993,9 @@
int value_v) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
- uint8* start_y = dst_y + y * dst_stride_y + x;
- uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
- uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+ uint8_t* start_y = dst_y + y * dst_stride_y + x;
+ uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
value_v < 0 || value_v > 255) {
@@ -1616,15 +2010,16 @@
// Draw a rectangle into ARGB
LIBYUV_API
-int ARGBRect(uint8* dst_argb,
+int ARGBRect(uint8_t* dst_argb,
int dst_stride_argb,
int dst_x,
int dst_y,
int width,
int height,
- uint32 value) {
+ uint32_t value) {
int y;
- void (*ARGBSetRow)(uint8 * dst_argb, uint32 value, int width) = ARGBSetRow_C;
+ void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+ ARGBSetRow_C;
if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
@@ -1685,15 +2080,15 @@
// f is foreground pixel premultiplied by alpha
LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb,
+int ARGBAttenuate(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) =
- ARGBAttenuateRow_C;
+ void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBAttenuateRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -1740,6 +2135,14 @@
}
}
#endif
+#if defined(HAS_ARGBATTENUATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBAttenuateRow(src_argb, dst_argb, width);
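
Attenuation premultiplies each color channel by the pixel's alpha and leaves A untouched. A per-pixel sketch using the exact c * a / 255; the SIMD rows use faster shift-based approximations, so outputs can differ by a small rounding error:

```cpp
#include <cstdint>

// Premultiply one ARGB pixel's color channels by its alpha.
static void AttenuatePixel(const uint8_t src[4], uint8_t dst[4]) {
  const uint32_t a = src[3];
  dst[0] = static_cast<uint8_t>(src[0] * a / 255);  // B
  dst[1] = static_cast<uint8_t>(src[1] * a / 255);  // G
  dst[2] = static_cast<uint8_t>(src[2] * a / 255);  // R
  dst[3] = src[3];                                  // A unchanged
}
```
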
@@ -1751,14 +2154,14 @@
// Convert preattenuated ARGB to unattenuated ARGB.
LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb,
+int ARGBUnattenuate(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
int width) = ARGBUnattenuateRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1802,14 +2205,14 @@
// Convert ARGB to Grayed ARGB.
LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb,
+int ARGBGrayTo(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
ARGBGrayRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -1840,6 +2243,11 @@
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(src_argb, dst_argb, width);
@@ -1851,16 +2259,16 @@
// Make a rectangle of ARGB gray scale.
LIBYUV_API
-int ARGBGray(uint8* dst_argb,
+int ARGBGray(uint8_t* dst_argb,
int dst_stride_argb,
int dst_x,
int dst_y,
int width,
int height) {
int y;
- void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
ARGBGrayRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
@@ -1885,6 +2293,11 @@
ARGBGrayRow = ARGBGrayRow_MSA;
}
#endif
+#if defined(HAS_ARGBGRAYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBGrayRow = ARGBGrayRow_MMI;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBGrayRow(dst, dst, width);
@@ -1895,15 +2308,15 @@
// Make a rectangle of ARGB Sepia tone.
LIBYUV_API
-int ARGBSepia(uint8* dst_argb,
+int ARGBSepia(uint8_t* dst_argb,
int dst_stride_argb,
int dst_x,
int dst_y,
int width,
int height) {
int y;
- void (*ARGBSepiaRow)(uint8 * dst_argb, int width) = ARGBSepiaRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
@@ -1928,6 +2341,11 @@
ARGBSepiaRow = ARGBSepiaRow_MSA;
}
#endif
+#if defined(HAS_ARGBSEPIAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBSepiaRow = ARGBSepiaRow_MMI;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBSepiaRow(dst, width);
@@ -1939,16 +2357,16 @@
// Apply a 4x4 matrix to each ARGB pixel.
// Note: Normally for shading, but can be used to swizzle or invert.
LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb,
+int ARGBColorMatrix(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- const int8* matrix_argb,
+ const int8_t* matrix_argb,
int width,
int height) {
int y;
- void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
- const int8* matrix_argb, int width) =
+ void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ const int8_t* matrix_argb, int width) =
ARGBColorMatrixRow_C;
if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
return -1;
@@ -1974,6 +2392,16 @@
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
+ }
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_MMI;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
src_argb += src_stride_argb;
@@ -1985,15 +2413,15 @@
// Apply a 4x3 matrix to each ARGB pixel.
// Deprecated.
LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb,
+int RGBColorMatrix(uint8_t* dst_argb,
int dst_stride_argb,
- const int8* matrix_rgb,
+ const int8_t* matrix_rgb,
int dst_x,
int dst_y,
int width,
int height) {
- SIMD_ALIGNED(int8 matrix_argb[16]);
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ SIMD_ALIGNED(int8_t matrix_argb[16]);
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
dst_y < 0) {
return -1;
@@ -2015,24 +2443,24 @@
matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
matrix_argb[15] = 64; // 1.0
- return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, dst,
+ return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst,
dst_stride_argb, &matrix_argb[0], width, height);
}
// Apply a color table to each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
-int ARGBColorTable(uint8* dst_argb,
+int ARGBColorTable(uint8_t* dst_argb,
int dst_stride_argb,
- const uint8* table_argb,
+ const uint8_t* table_argb,
int dst_x,
int dst_y,
int width,
int height) {
int y;
- void (*ARGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
+ void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
int width) = ARGBColorTableRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
dst_y < 0) {
return -1;
@@ -2058,17 +2486,17 @@
// Apply a color table to each ARGB pixel but preserve destination alpha.
// Table contains 256 ARGB values.
LIBYUV_API
-int RGBColorTable(uint8* dst_argb,
+int RGBColorTable(uint8_t* dst_argb,
int dst_stride_argb,
- const uint8* table_argb,
+ const uint8_t* table_argb,
int dst_x,
int dst_y,
int width,
int height) {
int y;
- void (*RGBColorTableRow)(uint8 * dst_argb, const uint8* table_argb,
+ void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
int width) = RGBColorTableRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
dst_y < 0) {
return -1;
@@ -2101,7 +2529,7 @@
// Caveat - although SSE2 saturates, the C function does not and should be used
// with care if doing anything but quantization.
LIBYUV_API
-int ARGBQuantize(uint8* dst_argb,
+int ARGBQuantize(uint8_t* dst_argb,
int dst_stride_argb,
int scale,
int interval_size,
@@ -2111,9 +2539,9 @@
int width,
int height) {
int y;
- void (*ARGBQuantizeRow)(uint8 * dst_argb, int scale, int interval_size,
+ void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
int interval_offset, int width) = ARGBQuantizeRow_C;
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
interval_size < 1 || interval_size > 255) {
return -1;
@@ -2134,6 +2562,11 @@
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
}
#endif
+#if defined(HAS_ARGBQUANTIZEROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_MSA;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
dst += dst_stride_argb;
@@ -2144,17 +2577,17 @@
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb,
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
int src_stride_argb,
- int32* dst_cumsum,
+ int32_t* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height) {
int y;
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) =
+ void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+ const int32_t* previous_cumsum, int width) =
ComputeCumulativeSumRow_C;
- int32* previous_cumsum = dst_cumsum;
+ int32_t* previous_cumsum = dst_cumsum;
if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
return -1;
}
@@ -2163,6 +2596,12 @@
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
+
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
for (y = 0; y < height; ++y) {
ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
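
ARGBComputeCumulativeSum builds a summed-area table: each entry is the sum of everything above and to the left, so any axis-aligned box sum collapses to four lookups, which is what lets ARGBBlur average a (2 * radius + 1) box per pixel in constant time. One standard formulation of the lookup:

```cpp
#include <cstdint>

// Box sum over [x0, x1) x [y0, y1) from a summed-area table in which
// sat[y * stride + x] holds the sum of all samples with row < y and
// col < x (one channel shown; the ARGB table stores 4 ints per pixel).
static int32_t BoxSum(const int32_t* sat, int stride, int x0, int y0, int x1,
                      int y1) {
  return sat[y1 * stride + x1] - sat[y0 * stride + x1] -
         sat[y1 * stride + x0] + sat[y0 * stride + x0];
}
```
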
@@ -2178,25 +2617,25 @@
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// as the buffer is treated as circular.
LIBYUV_API
-int ARGBBlur(const uint8* src_argb,
+int ARGBBlur(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- int32* dst_cumsum,
+ int32_t* dst_cumsum,
int dst_stride32_cumsum,
int width,
int height,
int radius) {
int y;
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
- const int32* previous_cumsum, int width) =
+ void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+ const int32_t* previous_cumsum, int width) =
ComputeCumulativeSumRow_C;
- void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
- int width, int area, uint8* dst,
- int count) = CumulativeSumToAverageRow_C;
- int32* cumsum_bot_row;
- int32* max_cumsum_bot_row;
- int32* cumsum_top_row;
+ void (*CumulativeSumToAverageRow)(
+ const int32_t* topleft, const int32_t* botleft, int width, int area,
+ uint8_t* dst, int count) = CumulativeSumToAverageRow_C;
+ int32_t* cumsum_bot_row;
+ int32_t* max_cumsum_bot_row;
+ int32_t* cumsum_top_row;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
@@ -2221,6 +2660,11 @@
CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI;
+ }
+#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
@@ -2250,7 +2694,7 @@
// Increment cumsum_bot_row pointer with circular buffer wrap around and
// then fill in a row of CumulativeSum.
if ((y + radius) < height) {
- const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ const int32_t* prev_cumsum_bot_row = cumsum_bot_row;
cumsum_bot_row += dst_stride32_cumsum;
if (cumsum_bot_row >= max_cumsum_bot_row) {
cumsum_bot_row = dst_cumsum;
@@ -2288,16 +2732,16 @@
// Multiply ARGB image by a specified ARGB value.
LIBYUV_API
-int ARGBShade(const uint8* src_argb,
+int ARGBShade(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
- uint32 value) {
+ uint32_t value) {
int y;
- void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width,
- uint32 value) = ARGBShadeRow_C;
+ void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+ uint32_t value) = ARGBShadeRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
return -1;
}
@@ -2327,6 +2771,11 @@
ARGBShadeRow = ARGBShadeRow_MSA;
}
#endif
+#if defined(HAS_ARGBSHADEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) {
+ ARGBShadeRow = ARGBShadeRow_MMI;
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -2338,17 +2787,17 @@
// Interpolate 2 planes by specified amount (0 to 255).
LIBYUV_API
-int InterpolatePlane(const uint8* src0,
+int InterpolatePlane(const uint8_t* src0,
int src_stride0,
- const uint8* src1,
+ const uint8_t* src1,
int src_stride1,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height,
int interpolation) {
int y;
- void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -2390,14 +2839,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src0, 4) &&
- IS_ALIGNED(src_stride0, 4) && IS_ALIGNED(src1, 4) &&
- IS_ALIGNED(src_stride1, 4) && IS_ALIGNED(dst, 4) &&
- IS_ALIGNED(dst_stride, 4) && IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -2406,6 +2847,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
InterpolateRow(dst, src0, src1 - src0, width, interpolation);
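
InterpolatePlane mixes two planes by a fraction in [0, 255]: 0 yields src0, 128 an even average, and larger values weight src1 more. A scalar sketch of the fixed-point lerp the row kernels approximate:

```cpp
#include <cstdint>

// Fixed-point lerp between two bytes with fraction f in [0, 255]:
// f == 0 returns |a|, f == 128 is an even average, larger f weights
// |b| more. The row kernels compute an equivalent /256 form.
static uint8_t LerpByte(uint8_t a, uint8_t b, int f) {
  return static_cast<uint8_t>((a * (256 - f) + b * f + 128) >> 8);
}
```
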
@@ -2418,11 +2867,11 @@
// Interpolate 2 ARGB images by specified amount (0 to 255).
LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0,
+int ARGBInterpolate(const uint8_t* src_argb0,
int src_stride_argb0,
- const uint8* src_argb1,
+ const uint8_t* src_argb1,
int src_stride_argb1,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
@@ -2434,23 +2883,23 @@
// Interpolate 2 YUV images by specified amount (0 to 255).
LIBYUV_API
-int I420Interpolate(const uint8* src0_y,
+int I420Interpolate(const uint8_t* src0_y,
int src0_stride_y,
- const uint8* src0_u,
+ const uint8_t* src0_u,
int src0_stride_u,
- const uint8* src0_v,
+ const uint8_t* src0_v,
int src0_stride_v,
- const uint8* src1_y,
+ const uint8_t* src1_y,
int src1_stride_y,
- const uint8* src1_u,
+ const uint8_t* src1_u,
int src1_stride_u,
- const uint8* src1_v,
+ const uint8_t* src1_v,
int src1_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
@@ -2472,16 +2921,16 @@
// Shuffle ARGB channel order. e.g. BGRA to ARGB.
LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra,
+int ARGBShuffle(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- const uint8* shuffler,
+ const uint8_t* shuffler,
int width,
int height) {
int y;
- void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
- const uint8* shuffler, int width) = ARGBShuffleRow_C;
+ void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb,
+ const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -2497,14 +2946,6 @@
height = 1;
src_stride_bgra = dst_stride_argb = 0;
}
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2)) {
- ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
- if (IS_ALIGNED(width, 4)) {
- ARGBShuffleRow = ARGBShuffleRow_SSE2;
- }
- }
-#endif
#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
@@ -2537,6 +2978,14 @@
}
}
#endif
+#if defined(HAS_ARGBSHUFFLEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBShuffleRow = ARGBShuffleRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -2547,23 +2996,23 @@
}
// Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb,
+static int ARGBSobelize(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
- void (*SobelRow)(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst,
+ void (*SobelRow)(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst,
int width)) {
int y;
- void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+ void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
ARGBToYJRow_C;
- void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely,
- int width) = SobelYRow_C;
- void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobely, int width) =
+ void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+ uint8_t* dst_sobely, int width) = SobelYRow_C;
+ void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+ const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
SobelXRow_C;
const int kEdge = 16; // Extra pixels at start of row for extrude/align.
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
@@ -2608,6 +3057,14 @@
}
}
#endif
+#if defined(HAS_ARGBTOYJROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBToYJRow = ARGBToYJRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SOBELYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2619,6 +3076,16 @@
SobelYRow = SobelYRow_NEON;
}
#endif
+#if defined(HAS_SOBELYROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelYRow = SobelYRow_MSA;
+ }
+#endif
+#if defined(HAS_SOBELYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelYRow = SobelYRow_MMI;
+ }
+#endif
#if defined(HAS_SOBELXROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXRow = SobelXRow_SSE2;
@@ -2629,18 +3096,28 @@
SobelXRow = SobelXRow_NEON;
}
#endif
+#if defined(HAS_SOBELXROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SobelXRow = SobelXRow_MSA;
+ }
+#endif
+#if defined(HAS_SOBELXROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXRow = SobelXRow_MMI;
+ }
+#endif
{
// 3 rows with edges before/after.
const int kRowSize = (width + kEdge + 31) & ~31;
align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
- uint8* row_sobelx = rows;
- uint8* row_sobely = rows + kRowSize;
- uint8* row_y = rows + kRowSize * 2;
+ uint8_t* row_sobelx = rows;
+ uint8_t* row_sobely = rows + kRowSize;
+ uint8_t* row_y = rows + kRowSize * 2;
// Convert first row.
- uint8* row_y0 = row_y + kEdge;
- uint8* row_y1 = row_y0 + kRowSize;
- uint8* row_y2 = row_y1 + kRowSize;
+ uint8_t* row_y0 = row_y + kEdge;
+ uint8_t* row_y1 = row_y0 + kRowSize;
+ uint8_t* row_y2 = row_y1 + kRowSize;
ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
@@ -2664,7 +3141,7 @@
// Cycle thru circular queue of 3 row_y buffers.
{
- uint8* row_yt = row_y0;
+ uint8_t* row_yt = row_y0;
row_y0 = row_y1;
row_y1 = row_y2;
row_y2 = row_yt;
@@ -2679,14 +3156,14 @@
// Sobel ARGB effect.
LIBYUV_API
-int ARGBSobel(const uint8* src_argb,
+int ARGBSobel(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelRow_C;
+ void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+ uint8_t* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelRow = SobelRow_Any_SSE2;
@@ -2711,20 +3188,28 @@
}
}
#endif
+#if defined(HAS_SOBELROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelRow = SobelRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_MMI;
+ }
+ }
+#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelRow);
}
// Sobel ARGB effect with planar output.
LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb,
+int ARGBSobelToPlane(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
- void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_, int width) = SobelToPlaneRow_C;
+ void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+ uint8_t* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
@@ -2749,6 +3234,14 @@
}
}
#endif
+#if defined(HAS_SOBELTOPLANEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelToPlaneRow = SobelToPlaneRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelToPlaneRow = SobelToPlaneRow_MMI;
+ }
+ }
+#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
height, SobelToPlaneRow);
}
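
The Sobel pipeline converts ARGB to luma with ARGBToYJRow, then convolves three consecutive luma rows with the 3x3 Sobel kernels; SobelRow, SobelToPlaneRow and SobelXYRow differ only in how the gradients are packed into the output. A textbook per-pixel sketch (the actual row kernels are structured as row differences, but compute the same operator):

```cpp
#include <cstdint>
#include <cstdlib>

// Classic 3x3 Sobel response at column |x| over three luma rows.
// gx responds to vertical edges, gy to horizontal ones; the output
// is a clamped |gx| (X), |gy| (Y) or |gx| + |gy| (combined).
static uint8_t SobelAt(const uint8_t* y0, const uint8_t* y1,
                       const uint8_t* y2, int x) {
  const int gx = (y0[x - 1] + 2 * y1[x - 1] + y2[x - 1]) -
                 (y0[x + 1] + 2 * y1[x + 1] + y2[x + 1]);
  const int gy = (y0[x - 1] + 2 * y0[x] + y0[x + 1]) -
                 (y2[x - 1] + 2 * y2[x] + y2[x + 1]);
  const int sum = abs(gx) + abs(gy);
  return static_cast<uint8_t>(sum > 255 ? 255 : sum);
}
```
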
@@ -2756,14 +3249,14 @@
// SobelXY ARGB effect.
// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb,
+int ARGBSobelXY(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
- void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelXYRow_C;
+ void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+ uint8_t* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SobelXYRow = SobelXYRow_Any_SSE2;
@@ -2788,21 +3281,29 @@
}
}
#endif
+#if defined(HAS_SOBELXYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SobelXYRow = SobelXYRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_MMI;
+ }
+ }
+#endif
return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height, SobelXYRow);
}
// Apply a 4x4 polynomial to each ARGB pixel.
LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
const float* poly,
int width,
int height) {
int y;
- void (*ARGBPolynomialRow)(const uint8* src_argb, uint8* dst_argb,
+ void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb,
const float* poly, int width) = ARGBPolynomialRow_C;
if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
return -1;
@@ -2842,16 +3343,16 @@
// Convert plane of 16 bit shorts to half floats.
// Source values are multiplied by scale before storing as half float.
LIBYUV_API
-int HalfFloatPlane(const uint16* src_y,
+int HalfFloatPlane(const uint16_t* src_y,
int src_stride_y,
- uint16* dst_y,
+ uint16_t* dst_y,
int dst_stride_y,
float scale,
int width,
int height) {
int y;
- void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
- HalfFloatRow_C;
+ void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale,
+ int width) = HalfFloatRow_C;
if (!src_y || !dst_y || width <= 0 || height == 0) {
return -1;
}
@@ -2903,6 +3404,14 @@
}
}
#endif
+#if defined(HAS_HALFFLOATROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ HalfFloatRow = HalfFloatRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ HalfFloatRow = HalfFloatRow_MSA;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
HalfFloatRow(src_y, dst_y, scale, width);
@@ -2912,19 +3421,40 @@
return 0;
}
+// Convert a buffer of bytes to floats, scaling each value as it is stored.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+ void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+ int width) = ByteToFloatRow_C;
+ if (!src_y || !dst_y || width <= 0) {
+ return -1;
+ }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ByteToFloatRow = ByteToFloatRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ByteToFloatRow = ByteToFloatRow_NEON;
+ }
+ }
+#endif
+
+ ByteToFloatRow(src_y, dst_y, scale, width);
+ return 0;
+}
+
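
ByteToFloat above is a new one-shot helper with no stride or height handling: it converts a flat byte buffer to floats, scaled. For example, normalizing bytes to [0, 1] (include path assumed):

```cpp
#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // assumed to declare ByteToFloat

// Normalize a flat byte buffer into floats in [0, 1].
std::vector<float> BytesToUnitFloats(const uint8_t* src, int count) {
  std::vector<float> dst(count);
  libyuv::ByteToFloat(src, dst.data(), 1.0f / 255.0f, count);
  return dst;
}
```
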
+// Apply a luma color table to each ARGB pixel.
LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb,
+int ARGBLumaColorTable(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- const uint8* luma,
+ const uint8_t* luma,
int width,
int height) {
int y;
void (*ARGBLumaColorTableRow)(
- const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma,
- const uint32 lumacoeff) = ARGBLumaColorTableRow_C;
+ const uint8_t* src_argb, uint8_t* dst_argb, int width,
+ const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C;
if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
return -1;
}
@@ -2956,15 +3486,15 @@
// Copy Alpha from one ARGB image to another.
LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb,
+int ARGBCopyAlpha(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
- ARGBCopyAlphaRow_C;
+ void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+ int width) = ARGBCopyAlphaRow_C;
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -2996,6 +3526,14 @@
}
}
#endif
+#if defined(HAS_ARGBCOPYALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyAlphaRow(src_argb, dst_argb, width);
@@ -3007,10 +3545,10 @@
// Extract just the alpha channel from ARGB.
LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb,
- int src_stride,
- uint8* dst_a,
- int dst_stride,
+int ARGBExtractAlpha(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_a,
+ int dst_stride_a,
int width,
int height) {
if (!src_argb || !dst_a || width <= 0 || height == 0) {
@@ -3019,17 +3557,17 @@
// Negative height means invert the image.
if (height < 0) {
height = -height;
- src_argb += (height - 1) * src_stride;
- src_stride = -src_stride;
+ src_argb += (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
}
// Coalesce rows.
- if (src_stride == width * 4 && dst_stride == width) {
+ if (src_stride_argb == width * 4 && dst_stride_a == width) {
width *= height;
height = 1;
- src_stride = dst_stride = 0;
+ src_stride_argb = dst_stride_a = 0;
}
- void (*ARGBExtractAlphaRow)(const uint8* src_argb, uint8* dst_a, int width) =
- ARGBExtractAlphaRow_C;
+ void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+ int width) = ARGBExtractAlphaRow_C;
#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
@@ -3048,26 +3586,38 @@
: ARGBExtractAlphaRow_Any_NEON;
}
#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+ : ARGBExtractAlphaRow_Any_MSA;
+ }
+#endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_MMI
+ : ARGBExtractAlphaRow_Any_MMI;
+ }
+#endif
for (int y = 0; y < height; ++y) {
ARGBExtractAlphaRow(src_argb, dst_a, width);
- src_argb += src_stride;
- dst_a += dst_stride;
+ src_argb += src_stride_argb;
+ dst_a += dst_stride_a;
}
return 0;
}
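
Like most entry points in this file, ARGBExtractAlpha treats a negative height as a vertical flip, starting at the last row and walking the stride backwards. Extracting a flipped alpha plane is therefore a one-liner (include path assumed):

```cpp
#include <cstdint>
#include "libyuv/planar_functions.h"  // assumed to declare ARGBExtractAlpha

// Pull the A channel out of an ARGB image, flipped vertically by
// passing a negative height.
int ExtractAlphaFlipped(const uint8_t* argb, int argb_stride, uint8_t* alpha,
                        int alpha_stride, int width, int height) {
  return libyuv::ARGBExtractAlpha(argb, argb_stride, alpha, alpha_stride,
                                  width, -height);
}
```
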
// Copy a planar Y channel to the alpha channel of a destination ARGB image.
LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y,
+int ARGBCopyYToAlpha(const uint8_t* src_y,
int src_stride_y,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
- void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
- ARGBCopyYToAlphaRow_C;
+ void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
+ int width) = ARGBCopyYToAlphaRow_C;
if (!src_y || !dst_argb || width <= 0 || height == 0) {
return -1;
}
@@ -3099,6 +3649,14 @@
}
}
#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_MMI;
+ }
+ }
+#endif
for (y = 0; y < height; ++y) {
ARGBCopyYToAlphaRow(src_y, dst_argb, width);
@@ -3112,19 +3670,19 @@
// directly. A SplitUVRow_Odd function could copy the remaining chroma.
LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2,
+int YUY2ToNV12(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
@@ -3160,6 +3718,22 @@
}
}
#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3192,6 +3766,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
@@ -3220,19 +3802,19 @@
}
LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy,
+int UYVYToNV12(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
- void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
- void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
@@ -3268,6 +3850,22 @@
}
}
#endif
+#if defined(HAS_SPLITUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ SplitUVRow = SplitUVRow_Any_MSA;
+ if (IS_ALIGNED(width, 32)) {
+ SplitUVRow = SplitUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ SplitUVRow = SplitUVRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ SplitUVRow = SplitUVRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3300,6 +3898,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
{
int awidth = halfwidth * 2;
diff --git a/files/source/rotate.cc b/files/source/rotate.cc
index 4330884..d414186 100644
--- a/files/source/rotate.cc
+++ b/files/source/rotate.cc
@@ -22,18 +22,18 @@
#endif
LIBYUV_API
-void TransposePlane(const uint8* src,
+void TransposePlane(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
int i = height;
#if defined(HAS_TRANSPOSEWX16_MSA)
- void (*TransposeWx16)(const uint8* src, int src_stride, uint8* dst,
+ void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C;
#else
- void (*TransposeWx8)(const uint8* src, int src_stride, uint8* dst,
+ void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx8_C;
#endif
#if defined(HAS_TRANSPOSEWX8_NEON)
@@ -49,6 +49,11 @@
}
}
#endif
+#if defined(HAS_TRANSPOSEWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeWx8 = TransposeWx8_MMI;
+ }
+#endif
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
TransposeWx8 = TransposeWx8_Fast_Any_SSSE3;
@@ -57,16 +62,6 @@
}
}
#endif
-#if defined(HAS_TRANSPOSEWX8_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
- IS_ALIGNED(src_stride, 4)) {
- TransposeWx8 = TransposeWx8_Fast_DSPR2;
- } else {
- TransposeWx8 = TransposeWx8_DSPR2;
- }
- }
-#endif
#if defined(HAS_TRANSPOSEWX16_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
TransposeWx16 = TransposeWx16_Any_MSA;
@@ -100,9 +95,9 @@
}
LIBYUV_API
-void RotatePlane90(const uint8* src,
+void RotatePlane90(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
@@ -115,9 +110,9 @@
}
LIBYUV_API
-void RotatePlane270(const uint8* src,
+void RotatePlane270(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
@@ -130,20 +125,20 @@
}
LIBYUV_API
-void RotatePlane180(const uint8* src,
+void RotatePlane180(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width);
- const uint8* src_bot = src + src_stride * (height - 1);
- uint8* dst_bot = dst + dst_stride * (height - 1);
+ const uint8_t* src_bot = src + src_stride * (height - 1);
+ uint8_t* dst_bot = dst + dst_stride * (height - 1);
int half_height = (height + 1) >> 1;
int y;
- void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
@@ -168,14 +163,6 @@
}
}
#endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst, 4) &&
- IS_ALIGNED(dst_stride, 4)) {
- MirrorRow = MirrorRow_DSPR2;
- }
-#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
@@ -184,6 +171,14 @@
}
}
#endif
+#if defined(HAS_MIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MirrorRow = MirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 8)) {
+ MirrorRow = MirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -204,9 +199,9 @@
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
+#if defined(HAS_COPYROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI;
}
#endif
@@ -224,22 +219,22 @@
}
LIBYUV_API
-void TransposeUV(const uint8* src,
+void TransposeUV(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
int i = height;
#if defined(HAS_TRANSPOSEUVWX16_MSA)
- void (*TransposeUVWx16)(const uint8* src, int src_stride, uint8* dst_a,
- int dst_stride_a, uint8* dst_b, int dst_stride_b,
+ void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx16_C;
#else
- void (*TransposeUVWx8)(const uint8* src, int src_stride, uint8* dst_a,
- int dst_stride_a, uint8* dst_b, int dst_stride_b,
+ void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
int width) = TransposeUVWx8_C;
#endif
#if defined(HAS_TRANSPOSEUVWX8_NEON)
@@ -255,10 +250,12 @@
}
}
#endif
-#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) &&
- IS_ALIGNED(src_stride, 4)) {
- TransposeUVWx8 = TransposeUVWx8_DSPR2;
+#if defined(HAS_TRANSPOSEUVWX8_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ TransposeUVWx8 = TransposeUVWx8_Any_MMI;
+ if (IS_ALIGNED(width, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MMI;
+ }
}
#endif
#if defined(HAS_TRANSPOSEUVWX16_MSA)
@@ -299,11 +296,11 @@
}
LIBYUV_API
-void RotateUV90(const uint8* src,
+void RotateUV90(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
@@ -315,11 +312,11 @@
}
LIBYUV_API
-void RotateUV270(const uint8* src,
+void RotateUV270(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
@@ -334,17 +331,17 @@
// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
-void RotateUV180(const uint8* src,
+void RotateUV180(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
int i;
- void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
- MirrorUVRow_C;
+ void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+ int width) = MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorUVRow = MirrorUVRow_NEON;
@@ -355,10 +352,14 @@
MirrorUVRow = MirrorUVRow_SSSE3;
}
#endif
-#if defined(HAS_MIRRORUVROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src, 4) &&
- IS_ALIGNED(src_stride, 4)) {
- MirrorUVRow = MirrorUVRow_DSPR2;
+#if defined(HAS_MIRRORUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+ MirrorUVRow = MirrorUVRow_MSA;
+ }
+#endif
+#if defined(HAS_MIRRORUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) {
+ MirrorUVRow = MirrorUVRow_MMI;
}
#endif
@@ -374,9 +375,9 @@
}
LIBYUV_API
-int RotatePlane(const uint8* src,
+int RotatePlane(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height,
@@ -413,17 +414,17 @@
}
LIBYUV_API
-int I420Rotate(const uint8* src_y,
+int I420Rotate(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
@@ -481,15 +482,75 @@
}
LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y,
+int I444Rotate(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height,
+ enum libyuv::RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+ !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case libyuv::kRotate0:
+ // copy frame
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate90:
+ RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate270:
+ RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ case libyuv::kRotate180:
+ RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_uv,
+ const uint8_t* src_uv,
int src_stride_uv,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
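
Note: I444Rotate is the only genuinely new entry point in this file. It treats all three planes as full resolution, so each plane gets the same width/height passed straight through to the plane rotators. A minimal usage sketch (frame size and buffers are illustrative only):

    #include <cstdint>
    #include <vector>
    #include "libyuv/rotate.h"

    int RotateI444By90() {
      const int w = 16, h = 8;
      std::vector<uint8_t> sy(w * h), su(w * h), sv(w * h);
      std::vector<uint8_t> dy(w * h), du(w * h), dv(w * h);
      // After a 90-degree rotation the image is h pixels wide, so every
      // destination stride is h rather than w.
      return libyuv::I444Rotate(sy.data(), w, su.data(), w, sv.data(), w,
                                dy.data(), h, du.data(), h, dv.data(), h,
                                w, h, libyuv::kRotate90);
    }
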
diff --git a/files/source/rotate_any.cc b/files/source/rotate_any.cc
index 562096b..b3baf08 100644
--- a/files/source/rotate_any.cc
+++ b/files/source/rotate_any.cc
@@ -19,8 +19,8 @@
#endif
#define TANY(NAMEANY, TPOS_SIMD, MASK) \
- void NAMEANY(const uint8* src, int src_stride, uint8* dst, int dst_stride, \
- int width) { \
+ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \
+ int dst_stride, int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
@@ -35,20 +35,21 @@
#ifdef HAS_TRANSPOSEWX8_SSSE3
TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7)
#endif
+#ifdef HAS_TRANSPOSEWX8_MMI
+TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7)
+#endif
#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
#endif
-#ifdef HAS_TRANSPOSEWX8_DSPR2
-TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
-#endif
#ifdef HAS_TRANSPOSEWX16_MSA
TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
#endif
#undef TANY
#define TUVANY(NAMEANY, TPOS_SIMD, MASK) \
- void NAMEANY(const uint8* src, int src_stride, uint8* dst_a, \
- int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { \
+ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \
+ int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \
+ int width) { \
int r = width & MASK; \
int n = width - r; \
if (n > 0) { \
@@ -64,8 +65,8 @@
#ifdef HAS_TRANSPOSEUVWX8_SSE2
TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
#endif
-#ifdef HAS_TRANSPOSEUVWX8_DSPR2
-TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEUVWX8_MMI
+TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7)
#endif
#ifdef HAS_TRANSPOSEUVWX16_MSA
TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
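
Note: the TANY/TUVANY macros wrap each SIMD transpose so callers never worry about width: the kernel handles the largest multiple of (MASK + 1) columns and the C kernel mops up the remainder. Expanded by hand for the new MMI entry (kernel bodies stubbed; the tail call is assumed from the upstream macro body, which this hunk truncates):

    #include <cstdint>

    void TransposeWx8_C(const uint8_t* src, int src_stride, uint8_t* dst,
                        int dst_stride, int width);  // scalar reference
    void TransposeWx8_MMI(const uint8_t* src, int src_stride, uint8_t* dst,
                          int dst_stride, int width);  // SIMD kernel

    // Roughly what TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) expands to:
    void TransposeWx8_Any_MMI(const uint8_t* src, int src_stride, uint8_t* dst,
                              int dst_stride, int width) {
      int r = width & 7;   // residual columns the SIMD kernel cannot take
      int n = width - r;   // largest multiple of 8
      if (n > 0) {
        TransposeWx8_MMI(src, src_stride, dst, dst_stride, n);
      }
      // A transposed column becomes a row, so the residual continues n
      // rows down in the destination.
      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);
    }
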
diff --git a/files/source/rotate_argb.cc b/files/source/rotate_argb.cc
index ede4eaf..a93fd55 100644
--- a/files/source/rotate_argb.cc
+++ b/files/source/rotate_argb.cc
@@ -14,113 +14,110 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || \
- (defined(__x86_64__) && !defined(__native_client__)) || \
- defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr,
- int src_stride,
- int src_stepx,
- uint8* dst_ptr,
- int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr,
- int src_stride,
- int src_stepx,
- uint8* dst_ptr,
- int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr,
- int,
- int src_stepx,
- uint8* dst_ptr,
- int dst_width);
-
-static void ARGBTranspose(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
+static void ARGBTranspose(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
int width,
int height) {
int i;
- int src_pixel_step = src_stride >> 2;
- void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
- int src_step, uint8* dst_ptr, int dst_width) =
- ScaleARGBRowDownEven_C;
+ int src_pixel_step = src_stride_argb >> 2;
+ void (*ScaleARGBRowDownEven)(
+ const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+ uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
}
#endif
#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
- ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(height, 4)) { // Width of dest.
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI;
+ }
}
#endif
for (i = 0; i < width; ++i) { // column of source to row of dest.
- ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
- dst += dst_stride;
- src += 4;
+ ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+ dst_argb += dst_stride_argb;
+ src_argb += 4;
}
}
-void ARGBRotate90(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
+void ARGBRotate90(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
int width,
int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
- src += src_stride * (height - 1);
- src_stride = -src_stride;
- ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+ src_argb += src_stride_argb * (height - 1);
+ src_stride_argb = -src_stride_argb;
+ ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
}
-void ARGBRotate270(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
+void ARGBRotate270(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
int width,
int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
- dst += dst_stride * (width - 1);
- dst_stride = -dst_stride;
- ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+ dst_argb += dst_stride_argb * (width - 1);
+ dst_stride_argb = -dst_stride_argb;
+ ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+ height);
}
-void ARGBRotate180(const uint8* src,
- int src_stride,
- uint8* dst,
- int dst_stride,
+void ARGBRotate180(const uint8_t* src_argb,
+ int src_stride_argb,
+ uint8_t* dst_argb,
+ int dst_stride_argb,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
- const uint8* src_bot = src + src_stride * (height - 1);
- uint8* dst_bot = dst + dst_stride * (height - 1);
+ const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+ uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
int half_height = (height + 1) >> 1;
int y;
- void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
ARGBMirrorRow_C;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+ CopyRow_C;
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
@@ -153,6 +150,14 @@
}
}
#endif
+#if defined(HAS_ARGBMIRRORROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_MMI;
+ if (IS_ALIGNED(width, 2)) {
+ ARGBMirrorRow = ARGBMirrorRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -173,29 +178,24 @@
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
// Odd height will harmlessly mirror the middle row twice.
for (y = 0; y < half_height; ++y) {
- ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
- ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
- CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
- src += src_stride;
- dst += dst_stride;
- src_bot -= src_stride;
- dst_bot -= dst_stride;
+ ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ src_bot -= src_stride_argb;
+ dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
}
LIBYUV_API
-int ARGBRotate(const uint8* src_argb,
+int ARGBRotate(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
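
Note: the rewritten ARGBTranspose keeps the same trick as before, now routed through scale_row.h instead of local prototypes: a source *column* of ARGB pixels is just a "row" whose pixel step is the stride in pixels (src_stride >> 2 for 4-byte pixels), so the scaler's even-pixel row copier can transpose one column per call. The idea in scalar form (hypothetical *Ref names, simplified signature, 4-byte-aligned buffers assumed):

    #include <cstdint>

    // Copy dst_width ARGB pixels, stepping src_stepx pixels between reads.
    static void ScaleARGBRowDownEvenRef(const uint8_t* src_argb, int src_stepx,
                                        uint8_t* dst_argb, int dst_width) {
      const uint32_t* src = reinterpret_cast<const uint32_t*>(src_argb);
      uint32_t* dst = reinterpret_cast<uint32_t*>(dst_argb);
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[x * src_stepx];
      }
    }

    static void ARGBTransposeRef(const uint8_t* src, int src_stride,
                                 uint8_t* dst, int dst_stride, int width,
                                 int height) {
      int src_pixel_step = src_stride >> 2;  // stride in pixels, not bytes
      for (int i = 0; i < width; ++i) {      // each source column -> dest row
        ScaleARGBRowDownEvenRef(src, src_pixel_step, dst, height);
        dst += dst_stride;
        src += 4;  // advance one ARGB pixel to the next source column
      }
    }
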
diff --git a/files/source/rotate_common.cc b/files/source/rotate_common.cc
index 89357e7..ff212ad 100644
--- a/files/source/rotate_common.cc
+++ b/files/source/rotate_common.cc
@@ -16,9 +16,9 @@
extern "C" {
#endif
-void TransposeWx8_C(const uint8* src,
+void TransposeWx8_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
int i;
@@ -36,11 +36,11 @@
}
}
-void TransposeUVWx8_C(const uint8* src,
+void TransposeUVWx8_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
int i;
@@ -67,9 +67,9 @@
}
}
-void TransposeWxH_C(const uint8* src,
+void TransposeWxH_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width,
int height) {
@@ -82,11 +82,11 @@
}
}
-void TransposeUVWxH_C(const uint8* src,
+void TransposeUVWxH_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width,
int height) {
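
Note: for reference, the Wx8 contract these signatures implement is: read `width` columns across 8 rows and write each column out as one 8-byte destination row. A scalar sketch consistent with the C kernels above:

    #include <cstdint>

    void TransposeWx8_Sketch(const uint8_t* src, int src_stride, uint8_t* dst,
                             int dst_stride, int width) {
      for (int i = 0; i < width; ++i) {    // source column i ...
        for (int j = 0; j < 8; ++j) {      // ... gathers one byte per row
          dst[i * dst_stride + j] = src[j * src_stride + i];
        }
      }
    }
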
diff --git a/files/source/rotate_gcc.cc b/files/source/rotate_gcc.cc
index 74b48ac..04e19e2 100644
--- a/files/source/rotate_gcc.cc
+++ b/files/source/rotate_gcc.cc
@@ -22,9 +22,9 @@
// Transpose 8x8. 32 or 64 bit, but not NaCl for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src,
+void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
@@ -112,9 +112,9 @@
// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src,
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
@@ -255,11 +255,11 @@
// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src,
+void TransposeUVWx8_SSE2(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
asm volatile(
diff --git a/files/source/rotate_mmi.cc b/files/source/rotate_mmi.cc
new file mode 100644
index 0000000..f8de608
--- /dev/null
+++ b/files/source/rotate_mmi.cc
@@ -0,0 +1,291 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+void TransposeWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst,
+ int dst_stride,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (00 10 01 11 02 12 03 13) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (04 14 05 15 06 16 07 17) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (20 30 21 31 22 32 23 33) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (24 34 25 35 26 36 27 37) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (00 10 20 30 01 11 21 31) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (02 12 22 32 03 13 23 33) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (04 14 24 34 05 15 25 35) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (06 16 26 36 07 17 27 37) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (40 50 41 51 42 52 43 53) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (44 54 45 55 46 56 47 57) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (60 70 61 71 62 72 63 73) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (64 74 65 75 66 76 67 77) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (40 50 60 70 41 51 61 71) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (42 52 62 72 43 53 63 73) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (44 54 64 74 45 55 65 75) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (46 56 66 76 47 57 67 77) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (00 10 20 30 40 50 60 70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (01 11 21 31 41 51 61 71) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (02 12 22 32 42 52 62 72) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (03 13 23 33 43 53 63 73) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (04 14 24 34 44 54 64 74) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (05 15 25 35 45 55 65 75) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ /* tmp0 = (06 16 26 36 46 56 66 76) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (07 17 27 37 47 57 67 77) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t"
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t"
+
+ "dadd %[dst], %[dst], %[dst_stride] \n\t"
+ "daddi %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst),
+ [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride),
+ [dst_stride] "r"(dst_stride)
+ : "memory");
+}
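
Note: the gssdlc1 (offset 7) / gssdrc1 (offset 0) pairs above are Loongson's unaligned 64-bit store idiom: the left/right halves cover the bytes on each side of an alignment boundary and together write 8 bytes at any address. The portable equivalent is simply an unaligned 8-byte store:

    #include <cstdint>
    #include <cstring>

    // Same effect as one gssdlc1/gssdrc1 pair: store a 64-bit value to a
    // possibly unaligned destination.
    static inline void StoreUnaligned64(uint8_t* dst, uint64_t v) {
      memcpy(dst, &v, sizeof(v));  // compilers lower this to an unaligned store
    }
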
+
+void TransposeUVWx8_MMI(const uint8_t* src,
+ int src_stride,
+ uint8_t* dst_a,
+ int dst_stride_a,
+ uint8_t* dst_b,
+ int dst_stride_b,
+ int width) {
+ uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+ uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13;
+ uint8_t* src_tmp = nullptr;
+
+ __asm__ volatile(
+ "1: \n\t"
+ /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */
+ "ldc1 %[tmp12], 0x00(%[src]) \n\t"
+ "dadd %[src_tmp], %[src], %[src_stride] \n\t"
+ /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */
+ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t"
+ /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */
+ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t"
+ /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */
+ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t"
+ /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */
+ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */
+ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t"
+ /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */
+ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t"
+
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */
+ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t"
+ /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */
+ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t"
+ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t"
+
+ /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */
+ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t"
+ /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */
+ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t"
+
+ /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */
+ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t"
+ /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */
+ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t"
+ /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */
+ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t"
+ /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */
+ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t"
+
+ /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */
+ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t"
+ /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */
+ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */
+ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t"
+ /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */
+ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */
+ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t"
+ /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */
+ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */
+ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t"
+ /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */
+ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t"
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t"
+ "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t"
+ "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t"
+
+ "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t"
+ "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t"
+ "daddiu %[src], %[src], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+
+ : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5),
+ [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8),
+ [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11),
+ [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a),
+ [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp)
+ : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a),
+ [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride)
+ : "memory");
+}
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
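
Note: the new kernels above are the classic log2(8) = 3-pass interleave transpose: punpck[lh]bh interleaves bytes of row pairs, punpck[lh]hw interleaves the results as 16-bit units, and punpck[lh]wd finishes with 32-bit units, leaving one source column per 64-bit register. The same data movement in portable C (a sketch of the algorithm, not the Loongson code path):

    #include <cstdint>
    #include <cstring>

    struct Vec { uint8_t b[8]; };  // models one 64-bit MMI register

    // Interleave the low (lo=true) or high halves of x and y in units of
    // es bytes: es=1 models punpck[lh]bh, es=2 punpck[lh]hw, es=4 punpck[lh]wd.
    static Vec Punpck(Vec x, Vec y, int es, bool lo) {
      Vec r;
      int base = lo ? 0 : 4;  // byte offset of the half being interleaved
      for (int u = 0; u < 4 / es; ++u) {
        memcpy(&r.b[2 * u * es], &x.b[base + u * es], es);
        memcpy(&r.b[(2 * u + 1) * es], &y.b[base + u * es], es);
      }
      return r;
    }

    static void Transpose8x8_3Pass(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
      Vec r[8], t[8], s[8];
      for (int i = 0; i < 8; ++i) memcpy(r[i].b, src + i * src_stride, 8);
      // Pass 1 (bytes): pair rows (0,1), (2,3), (4,5), (6,7).
      for (int p = 0; p < 4; ++p) {
        t[2 * p] = Punpck(r[2 * p], r[2 * p + 1], 1, true);
        t[2 * p + 1] = Punpck(r[2 * p], r[2 * p + 1], 1, false);
      }
      // Pass 2 (halfwords): pair (0,2) and (1,3) within each half of t[].
      for (int g = 0; g < 2; ++g) {
        for (int p = 0; p < 2; ++p) {
          s[4 * g + 2 * p] = Punpck(t[4 * g + p], t[4 * g + p + 2], 2, true);
          s[4 * g + 2 * p + 1] =
              Punpck(t[4 * g + p], t[4 * g + p + 2], 2, false);
        }
      }
      // Pass 3 (words): pair (0,4), (1,5), (2,6), (3,7); t[i] is now column i.
      for (int p = 0; p < 4; ++p) {
        t[2 * p] = Punpck(s[p], s[p + 4], 4, true);
        t[2 * p + 1] = Punpck(s[p], s[p + 4], 4, false);
      }
      for (int i = 0; i < 8; ++i) memcpy(dst + i * dst_stride, t[i].b, 8);
    }
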
diff --git a/files/source/rotate_msa.cc b/files/source/rotate_msa.cc
index 8907765..99bdca6 100644
--- a/files/source/rotate_msa.cc
+++ b/files/source/rotate_msa.cc
@@ -51,9 +51,9 @@
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
}
-void TransposeWx16_C(const uint8* src,
+void TransposeWx16_C(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
TransposeWx8_C(src, src_stride, dst, dst_stride, width);
@@ -61,11 +61,11 @@
width);
}
-void TransposeUVWx16_C(const uint8* src,
+void TransposeUVWx16_C(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
@@ -74,13 +74,13 @@
dst_stride_a, (dst_b + 8), dst_stride_b, width);
}
-void TransposeWx16_MSA(const uint8* src,
+void TransposeWx16_MSA(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
int x;
- const uint8* s;
+ const uint8_t* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
@@ -153,15 +153,15 @@
}
}
-void TransposeUVWx16_MSA(const uint8* src,
+void TransposeUVWx16_MSA(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
int x;
- const uint8* s;
+ const uint8_t* s;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
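
Note: the retyped TransposeWx16_C / TransposeUVWx16_C helpers (only fragments of their bodies appear in these hunks) build a 16-row transpose from two Wx8 passes: rows 8-15 of the source supply bytes 8-15 of every transposed row. In sketch form, with a stubbed scalar kernel:

    #include <cstdint>

    void TransposeWx8_Sketch(const uint8_t* src, int src_stride, uint8_t* dst,
                             int dst_stride, int width);  // scalar Wx8 kernel

    void TransposeWx16_Sketch(const uint8_t* src, int src_stride, uint8_t* dst,
                              int dst_stride, int width) {
      TransposeWx8_Sketch(src, src_stride, dst, dst_stride, width);
      // The lower 8 source rows land 8 bytes into each destination row.
      TransposeWx8_Sketch(src + 8 * src_stride, src_stride, dst + 8,
                          dst_stride, width);
    }
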
diff --git a/files/source/rotate_neon.cc b/files/source/rotate_neon.cc
index ef5c235..fdc0dd4 100644
--- a/files/source/rotate_neon.cc
+++ b/files/source/rotate_neon.cc
@@ -21,40 +21,32 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
-static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
- 2, 6, 10, 14, 3, 7, 11, 15};
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
-void TransposeWx8_NEON(const uint8* src,
+void TransposeWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
- const uint8* src_temp;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %5, #8 \n"
+ const uint8_t* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %5, #8 \n"
- // handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
"mov %0, %1 \n"
- MEMACCESS(0)
"vld1.8 {d0}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d1}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d2}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d3}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d4}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d5}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d6}, [%0], %2 \n"
- MEMACCESS(0)
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
@@ -79,21 +71,13 @@
"mov %0, %3 \n"
- MEMACCESS(0)
"vst1.8 {d1}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d3}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d5}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d7}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8
@@ -101,183 +85,138 @@
"subs %5, #8 \n" // w -= 8
"bge 1b \n"
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %5, #8 \n"
- "beq 4f \n"
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %5, #8 \n"
+ "beq 4f \n"
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %5, #2 \n"
- "blt 3f \n"
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %5, #2 \n"
+ "blt 3f \n"
- "cmp %5, #4 \n"
- "blt 2f \n"
+ "cmp %5, #4 \n"
+ "blt 2f \n"
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.32 {d3[1]}, [%0] \n"
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ "vld1.32 {d3[1]}, [%0] \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(6)
- "vld1.8 {q3}, [%6] \n"
+ "vld1.8 {q3}, [%6] \n"
- "vtbl.8 d4, {d0, d1}, d6 \n"
- "vtbl.8 d5, {d0, d1}, d7 \n"
- "vtbl.8 d0, {d2, d3}, d6 \n"
- "vtbl.8 d1, {d2, d3}, d7 \n"
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
- // TODO(frkoenig): Rework shuffle above to
- // write out with 4 instead of 8 writes.
- MEMACCESS(0)
- "vst1.32 {d4[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d4[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d5[1]}, [%0] \n"
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ "vst1.32 {d5[1]}, [%0] \n"
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d0[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d1[1]}, [%0] \n"
+ "add %0, %3, #4 \n"
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ "vst1.32 {d1[1]}, [%0] \n"
- "add %1, #4 \n" // src += 4
- "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
- "subs %5, #4 \n" // w -= 4
- "beq 4f \n"
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
+ "beq 4f \n"
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %5, #2 \n"
- "blt 3f \n"
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %5, #2 \n"
+ "blt 3f \n"
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.16 {d0[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d0[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.16 {d1[3]}, [%0] \n"
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ "vld1.16 {d1[3]}, [%0] \n"
- "vtrn.8 d0, d1 \n"
+ "vtrn.8 d0, d1 \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d1}, [%0] \n"
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d1}, [%0] \n"
- "add %1, #2 \n" // src += 2
- "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
- "subs %5, #2 \n" // w -= 2
- "beq 4f \n"
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
+ "beq 4f \n"
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld1.8 {d0[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld1.8 {d0[7]}, [%1] \n"
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ "vld1.8 {d0[7]}, [%1] \n"
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
+ "vst1.64 {d0}, [%3] \n"
- "4: \n"
+ "4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst), // %3
- "+r"(dst_stride), // %4
- "+r"(width) // %5
- : "r"(&kVTbl4x4Transpose) // %6
- : "memory", "cc", "q0", "q1", "q2", "q3"
- );
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3");
}
-static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
- 4, 12, 5, 13, 6, 14, 7, 15};
+static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
+ 4, 12, 5, 13, 6, 14, 7, 15};
-void TransposeUVWx8_NEON(const uint8* src,
+void TransposeUVWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
- const uint8* src_temp;
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %7, #8 \n"
+ const uint8_t* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %7, #8 \n"
- // handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
"mov %0, %1 \n"
- MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d2, d3}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d4, d5}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d6, d7}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d16, d17}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d18, d19}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d20, d21}, [%0], %2 \n"
- MEMACCESS(0)
"vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
@@ -306,40 +245,24 @@
"mov %0, %3 \n"
- MEMACCESS(0)
"vst1.8 {d2}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d0}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d6}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d4}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d18}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d16}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d22}, [%0], %4 \n"
- MEMACCESS(0)
"vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n"
- MEMACCESS(0)
"vst1.8 {d3}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d1}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d7}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d5}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d19}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d17}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d23}, [%0], %6 \n"
- MEMACCESS(0)
"vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2
@@ -348,187 +271,142 @@
"subs %7, #8 \n" // w -= 8
"bge 1b \n"
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %7, #8 \n"
- "beq 4f \n"
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %7, #8 \n"
+ "beq 4f \n"
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %7, #2 \n"
- "blt 3f \n"
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %7, #2 \n"
+ "blt 3f \n"
- "cmp %7, #4 \n"
- "blt 2f \n"
+ "cmp %7, #4 \n"
+ "blt 2f \n"
- // TODO(frkoenig): Clean this up
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld1.64 {d0}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d1}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d2}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d3}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d4}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d5}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d6}, [%0], %2 \n"
- MEMACCESS(0)
- "vld1.64 {d7}, [%0] \n"
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "vld1.64 {d0}, [%0], %2 \n"
+ "vld1.64 {d1}, [%0], %2 \n"
+ "vld1.64 {d2}, [%0], %2 \n"
+ "vld1.64 {d3}, [%0], %2 \n"
+ "vld1.64 {d4}, [%0], %2 \n"
+ "vld1.64 {d5}, [%0], %2 \n"
+ "vld1.64 {d6}, [%0], %2 \n"
+ "vld1.64 {d7}, [%0] \n"
- MEMACCESS(8)
- "vld1.8 {q15}, [%8] \n"
+ "vld1.8 {q15}, [%8] \n"
- "vtrn.8 q0, q1 \n"
- "vtrn.8 q2, q3 \n"
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
- "vtbl.8 d16, {d0, d1}, d30 \n"
- "vtbl.8 d17, {d0, d1}, d31 \n"
- "vtbl.8 d18, {d2, d3}, d30 \n"
- "vtbl.8 d19, {d2, d3}, d31 \n"
- "vtbl.8 d20, {d4, d5}, d30 \n"
- "vtbl.8 d21, {d4, d5}, d31 \n"
- "vtbl.8 d22, {d6, d7}, d30 \n"
- "vtbl.8 d23, {d6, d7}, d31 \n"
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "vst1.32 {d16[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d16[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d17[1]}, [%0], %4 \n"
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ "vst1.32 {d17[1]}, [%0], %4 \n"
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d20[1]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[0]}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.32 {d21[1]}, [%0] \n"
+ "add %0, %3, #4 \n"
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ "vst1.32 {d21[1]}, [%0] \n"
- "mov %0, %5 \n"
+ "mov %0, %5 \n"
- MEMACCESS(0)
- "vst1.32 {d18[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d18[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d19[1]}, [%0], %6 \n"
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ "vst1.32 {d19[1]}, [%0], %6 \n"
- "add %0, %5, #4 \n"
- MEMACCESS(0)
- "vst1.32 {d22[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d22[1]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[0]}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.32 {d23[1]}, [%0] \n"
+ "add %0, %5, #4 \n"
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ "vst1.32 {d23[1]}, [%0] \n"
- "add %1, #4*2 \n" // src += 4 * 2
- "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
- "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
- "subs %7, #4 \n" // w -= 4
- "beq 4f \n"
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
+ // dst_stride_b
+ "subs %7, #4 \n" // w -= 4
+ "beq 4f \n"
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %7, #2 \n"
- "blt 3f \n"
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %7, #2 \n"
+ "blt 3f \n"
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
- MEMACCESS(0)
- "vld2.16 {d1[3], d3[3]}, [%0] \n"
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
- "vtrn.8 d0, d1 \n"
- "vtrn.8 d2, d3 \n"
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "vst1.64 {d0}, [%0], %4 \n"
- MEMACCESS(0)
- "vst1.64 {d2}, [%0] \n"
+ "vst1.64 {d0}, [%0], %4 \n"
+ "vst1.64 {d2}, [%0] \n"
- "mov %0, %5 \n"
+ "mov %0, %5 \n"
- MEMACCESS(0)
- "vst1.64 {d1}, [%0], %6 \n"
- MEMACCESS(0)
- "vst1.64 {d3}, [%0] \n"
+ "vst1.64 {d1}, [%0], %6 \n"
+ "vst1.64 {d3}, [%0] \n"
- "add %1, #2*2 \n" // src += 2 * 2
- "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
- "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
- "subs %7, #2 \n" // w -= 2
- "beq 4f \n"
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
+ // dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
+ // dst_stride_b
+ "subs %7, #2 \n" // w -= 2
+ "beq 4f \n"
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
- MEMACCESS(1)
- "vld2.8 {d0[7], d1[7]}, [%1] \n"
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
- MEMACCESS(3)
- "vst1.64 {d0}, [%3] \n"
- MEMACCESS(5)
- "vst1.64 {d1}, [%5] \n"
+ "vst1.64 {d0}, [%3] \n"
+ "vst1.64 {d1}, [%5] \n"
- "4: \n"
+ "4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(src_stride), // %2
- "+r"(dst_a), // %3
- "+r"(dst_stride_a), // %4
- "+r"(dst_b), // %5
- "+r"(dst_stride_b), // %6
- "+r"(width) // %7
- : "r"(&kVTbl4x4TransposeDi) // %8
- : "memory", "cc",
- "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
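
Note: functionally this file only loses the MEMACCESS() NaCl annotations and gets reflowed by clang-format; the operand lists are untouched. As a reminder of what those constraint strings mean: "=&r" is a write-only early-clobber temporary, "+r" a read-write register, plain "r" an input, and the clobber list names everything else the block destroys. A tiny AArch64-only illustration of the same constraint kinds:

    #include <cstdint>

    uint64_t AddViaTemp(uint64_t acc, uint64_t x) {
      uint64_t tmp;
      asm volatile(
          "add %0, %1, %2            \n"  // tmp = acc + x
          "mov %1, %0                \n"  // acc = tmp
          : "=&r"(tmp),  // %0: written before inputs die -> early-clobber
            "+r"(acc)    // %1: read and written
          : "r"(x)       // %2: input only
          : "cc");       // flags listed conservatively, as in the files above
      return acc;
    }
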
diff --git a/files/source/rotate_neon64.cc b/files/source/rotate_neon64.cc
index f52b0ed..f469baa 100644
--- a/files/source/rotate_neon64.cc
+++ b/files/source/rotate_neon64.cc
@@ -21,41 +21,32 @@
// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
- 2, 6, 10, 14, 3, 7, 11, 15};
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
+ 2, 6, 10, 14, 3, 7, 11, 15};
-void TransposeWx8_NEON(const uint8* src,
+void TransposeWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
- const uint8* src_temp;
- int64 width64 = (int64)width; // Work around clang 3.4 warning.
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %3, %3, #8 \n"
+ const uint8_t* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %w3, %w3, #8 \n"
- // handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
"mov %0, %1 \n"
- MEMACCESS(0)
"ld1 {v0.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v1.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v2.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v3.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v4.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v5.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v6.8b}, [%0], %5 \n"
- MEMACCESS(0)
"ld1 {v7.8b}, [%0] \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
@@ -87,459 +78,345 @@
"mov %0, %2 \n"
- MEMACCESS(0)
"st1 {v17.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v16.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v19.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v18.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v21.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v20.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v23.8b}, [%0], %6 \n"
- MEMACCESS(0)
"st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
- "subs %3, %3, #8 \n" // w -= 8
+ "subs %w3, %w3, #8 \n" // w -= 8
"b.ge 1b \n"
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %3, %3, #8 \n"
- "b.eq 4f \n"
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w3, %w3, #8 \n"
+ "b.eq 4f \n"
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %3, #2 \n"
- "b.lt 3f \n"
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- "cmp %3, #4 \n"
- "b.lt 2f \n"
+ "cmp %w3, #4 \n"
+ "b.lt 2f \n"
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[3], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.s}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.s}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.s}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.s}[3], [%0] \n"
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ "ld1 {v1.s}[3], [%0] \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- MEMACCESS(4)
- "ld1 {v2.16b}, [%4] \n"
+ "ld1 {v2.16b}, [%4] \n"
- "tbl v3.16b, {v0.16b}, v2.16b \n"
- "tbl v0.16b, {v1.16b}, v2.16b \n"
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
- // TODO(frkoenig): Rework shuffle above to
- // write out with 4 instead of 8 writes.
- MEMACCESS(0)
- "st1 {v3.s}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v3.s}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v3.s}[2], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v3.s}[3], [%0] \n"
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "st1 {v3.s}[0], [%0], %6 \n"
+ "st1 {v3.s}[1], [%0], %6 \n"
+ "st1 {v3.s}[2], [%0], %6 \n"
+ "st1 {v3.s}[3], [%0] \n"
- "add %0, %2, #4 \n"
- MEMACCESS(0)
- "st1 {v0.s}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v0.s}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v0.s}[2], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v0.s}[3], [%0] \n"
+ "add %0, %2, #4 \n"
+ "st1 {v0.s}[0], [%0], %6 \n"
+ "st1 {v0.s}[1], [%0], %6 \n"
+ "st1 {v0.s}[2], [%0], %6 \n"
+ "st1 {v0.s}[3], [%0] \n"
- "add %1, %1, #4 \n" // src += 4
- "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
- "subs %3, %3, #4 \n" // w -= 4
- "b.eq 4f \n"
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %w3, %w3, #4 \n" // w -= 4
+ "b.eq 4f \n"
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %3, #2 \n"
- "b.lt 3f \n"
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w3, #2 \n"
+ "b.lt 3f \n"
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "ld1 {v0.h}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.h}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.h}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.h}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.h}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.h}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v0.h}[3], [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.h}[3], [%0] \n"
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ "ld1 {v1.h}[3], [%0] \n"
- "trn2 v2.8b, v0.8b, v1.8b \n"
- "trn1 v3.8b, v0.8b, v1.8b \n"
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- MEMACCESS(0)
- "st1 {v3.8b}, [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v2.8b}, [%0] \n"
+ "st1 {v3.8b}, [%0], %6 \n"
+ "st1 {v2.8b}, [%0] \n"
- "add %1, %1, #2 \n" // src += 2
- "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
- "subs %3, %3, #2 \n" // w -= 2
- "b.eq 4f \n"
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %w3, %w3, #2 \n" // w -= 2
+ "b.eq 4f \n"
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "ld1 {v0.b}[0], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[1], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[2], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[3], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[4], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[5], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[6], [%1], %5 \n"
- MEMACCESS(1)
- "ld1 {v0.b}[7], [%1] \n"
+ // 1x8 block
+ "3: \n"
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ "ld1 {v0.b}[7], [%1] \n"
- MEMACCESS(2)
- "st1 {v0.8b}, [%2] \n"
+ "st1 {v0.8b}, [%2] \n"
- "4: \n"
+ "4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst), // %2
- "+r"(width64) // %3
- : "r"(&kVTbl4x4Transpose), // %4
- "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
- "v17", "v18", "v19", "v20", "v21", "v22", "v23"
- );
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
-static uint8 kVTbl4x4TransposeDi[32] = {
+static const uint8_t kVTbl4x4TransposeDi[32] = {
0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
-void TransposeUVWx8_NEON(const uint8* src,
+void TransposeUVWx8_NEON(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int width) {
- const uint8* src_temp;
- int64 width64 = (int64)width; // Work around clang 3.4 warning.
- asm volatile (
- // loops are on blocks of 8. loop will stop when
- // counter gets to or below 0. starting the counter
- // at w-8 allow for this
- "sub %4, %4, #8 \n"
+ const uint8_t* src_temp;
+ asm volatile(
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allows for this
+ "sub %w4, %w4, #8 \n"
- // handle 8x8 blocks. this should be the majority of the plane
- "1: \n"
- "mov %0, %1 \n"
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v2.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v3.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v4.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v5.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v6.16b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v7.16b}, [%0] \n"
+ "ld1 {v0.16b}, [%0], %5 \n"
+ "ld1 {v1.16b}, [%0], %5 \n"
+ "ld1 {v2.16b}, [%0], %5 \n"
+ "ld1 {v3.16b}, [%0], %5 \n"
+ "ld1 {v4.16b}, [%0], %5 \n"
+ "ld1 {v5.16b}, [%0], %5 \n"
+ "ld1 {v6.16b}, [%0], %5 \n"
+ "ld1 {v7.16b}, [%0] \n"
- "trn1 v16.16b, v0.16b, v1.16b \n"
- "trn2 v17.16b, v0.16b, v1.16b \n"
- "trn1 v18.16b, v2.16b, v3.16b \n"
- "trn2 v19.16b, v2.16b, v3.16b \n"
- "trn1 v20.16b, v4.16b, v5.16b \n"
- "trn2 v21.16b, v4.16b, v5.16b \n"
- "trn1 v22.16b, v6.16b, v7.16b \n"
- "trn2 v23.16b, v6.16b, v7.16b \n"
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
- "trn1 v0.8h, v16.8h, v18.8h \n"
- "trn2 v1.8h, v16.8h, v18.8h \n"
- "trn1 v2.8h, v20.8h, v22.8h \n"
- "trn2 v3.8h, v20.8h, v22.8h \n"
- "trn1 v4.8h, v17.8h, v19.8h \n"
- "trn2 v5.8h, v17.8h, v19.8h \n"
- "trn1 v6.8h, v21.8h, v23.8h \n"
- "trn2 v7.8h, v21.8h, v23.8h \n"
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
- "trn1 v16.4s, v0.4s, v2.4s \n"
- "trn2 v17.4s, v0.4s, v2.4s \n"
- "trn1 v18.4s, v1.4s, v3.4s \n"
- "trn2 v19.4s, v1.4s, v3.4s \n"
- "trn1 v20.4s, v4.4s, v6.4s \n"
- "trn2 v21.4s, v4.4s, v6.4s \n"
- "trn1 v22.4s, v5.4s, v7.4s \n"
- "trn2 v23.4s, v5.4s, v7.4s \n"
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- MEMACCESS(0)
- "st1 {v16.d}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v18.d}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v17.d}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v19.d}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v16.d}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v18.d}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v17.d}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v19.d}[1], [%0] \n"
+ "st1 {v16.d}[0], [%0], %6 \n"
+ "st1 {v18.d}[0], [%0], %6 \n"
+ "st1 {v17.d}[0], [%0], %6 \n"
+ "st1 {v19.d}[0], [%0], %6 \n"
+ "st1 {v16.d}[1], [%0], %6 \n"
+ "st1 {v18.d}[1], [%0], %6 \n"
+ "st1 {v17.d}[1], [%0], %6 \n"
+ "st1 {v19.d}[1], [%0] \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "st1 {v20.d}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v22.d}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v21.d}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v23.d}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v20.d}[1], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v22.d}[1], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v21.d}[1], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v23.d}[1], [%0] \n"
+ "st1 {v20.d}[0], [%0], %7 \n"
+ "st1 {v22.d}[0], [%0], %7 \n"
+ "st1 {v21.d}[0], [%0], %7 \n"
+ "st1 {v23.d}[0], [%0], %7 \n"
+ "st1 {v20.d}[1], [%0], %7 \n"
+ "st1 {v22.d}[1], [%0], %7 \n"
+ "st1 {v21.d}[1], [%0], %7 \n"
+ "st1 {v23.d}[1], [%0] \n"
- "add %1, %1, #16 \n" // src += 8*2
- "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
- "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
- "subs %4, %4, #8 \n" // w -= 8
- "b.ge 1b \n"
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
+ // dst_stride_b
+ "subs %w4, %w4, #8 \n" // w -= 8
+ "b.ge 1b \n"
- // add 8 back to counter. if the result is 0 there are
- // no residuals.
- "adds %4, %4, #8 \n"
- "b.eq 4f \n"
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %w4, %w4, #8 \n"
+ "b.eq 4f \n"
- // some residual, so between 1 and 7 lines left to transpose
- "cmp %4, #2 \n"
- "b.lt 3f \n"
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- "cmp %4, #4 \n"
- "b.lt 2f \n"
+ "cmp %w4, #4 \n"
+ "b.lt 2f \n"
- // TODO(frkoenig): Clean this up
- // 4x8 block
- "mov %0, %1 \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v1.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v3.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v4.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v5.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v6.8b}, [%0], %5 \n"
- MEMACCESS(0)
- "ld1 {v7.8b}, [%0] \n"
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ "ld1 {v0.8b}, [%0], %5 \n"
+ "ld1 {v1.8b}, [%0], %5 \n"
+ "ld1 {v2.8b}, [%0], %5 \n"
+ "ld1 {v3.8b}, [%0], %5 \n"
+ "ld1 {v4.8b}, [%0], %5 \n"
+ "ld1 {v5.8b}, [%0], %5 \n"
+ "ld1 {v6.8b}, [%0], %5 \n"
+ "ld1 {v7.8b}, [%0] \n"
- MEMACCESS(8)
- "ld1 {v30.16b}, [%8], #16 \n"
- "ld1 {v31.16b}, [%8] \n"
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
- "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
- "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
- "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
- "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- MEMACCESS(0)
- "st1 {v16.s}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v16.s}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v16.s}[2], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v16.s}[3], [%0], %6 \n"
+ "st1 {v16.s}[0], [%0], %6 \n"
+ "st1 {v16.s}[1], [%0], %6 \n"
+ "st1 {v16.s}[2], [%0], %6 \n"
+ "st1 {v16.s}[3], [%0], %6 \n"
- "add %0, %2, #4 \n"
- MEMACCESS(0)
- "st1 {v18.s}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v18.s}[1], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v18.s}[2], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v18.s}[3], [%0] \n"
+ "add %0, %2, #4 \n"
+ "st1 {v18.s}[0], [%0], %6 \n"
+ "st1 {v18.s}[1], [%0], %6 \n"
+ "st1 {v18.s}[2], [%0], %6 \n"
+ "st1 {v18.s}[3], [%0] \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "st1 {v17.s}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v17.s}[1], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v17.s}[2], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v17.s}[3], [%0], %7 \n"
+ "st1 {v17.s}[0], [%0], %7 \n"
+ "st1 {v17.s}[1], [%0], %7 \n"
+ "st1 {v17.s}[2], [%0], %7 \n"
+ "st1 {v17.s}[3], [%0], %7 \n"
- "add %0, %3, #4 \n"
- MEMACCESS(0)
- "st1 {v19.s}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v19.s}[1], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v19.s}[2], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v19.s}[3], [%0] \n"
+ "add %0, %3, #4 \n"
+ "st1 {v19.s}[0], [%0], %7 \n"
+ "st1 {v19.s}[1], [%0], %7 \n"
+ "st1 {v19.s}[2], [%0], %7 \n"
+ "st1 {v19.s}[3], [%0] \n"
- "add %1, %1, #8 \n" // src += 4 * 2
- "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
- "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
- "subs %4, %4, #4 \n" // w -= 4
- "b.eq 4f \n"
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
+ // dst_stride_b
+ "subs %w4, %w4, #4 \n" // w -= 4
+ "b.eq 4f \n"
- // some residual, check to see if it includes a 2x8 block,
- // or less
- "cmp %4, #2 \n"
- "b.lt 3f \n"
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %w4, #2 \n"
+ "b.lt 3f \n"
- // 2x8 block
- "2: \n"
- "mov %0, %1 \n"
- MEMACCESS(0)
- "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
- MEMACCESS(0)
- "ld2 {v2.h, v3.h}[3], [%0] \n"
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
- "trn1 v4.8b, v0.8b, v2.8b \n"
- "trn2 v5.8b, v0.8b, v2.8b \n"
- "trn1 v6.8b, v1.8b, v3.8b \n"
- "trn2 v7.8b, v1.8b, v3.8b \n"
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
- "mov %0, %2 \n"
+ "mov %0, %2 \n"
- MEMACCESS(0)
- "st1 {v4.d}[0], [%0], %6 \n"
- MEMACCESS(0)
- "st1 {v6.d}[0], [%0] \n"
+ "st1 {v4.d}[0], [%0], %6 \n"
+ "st1 {v6.d}[0], [%0] \n"
- "mov %0, %3 \n"
+ "mov %0, %3 \n"
- MEMACCESS(0)
- "st1 {v5.d}[0], [%0], %7 \n"
- MEMACCESS(0)
- "st1 {v7.d}[0], [%0] \n"
+ "st1 {v5.d}[0], [%0], %7 \n"
+ "st1 {v7.d}[0], [%0] \n"
- "add %1, %1, #4 \n" // src += 2 * 2
- "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
- "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
- "subs %4, %4, #2 \n" // w -= 2
- "b.eq 4f \n"
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
+ // dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
+ // dst_stride_b
+ "subs %w4, %w4, #2 \n" // w -= 2
+ "b.eq 4f \n"
- // 1x8 block
- "3: \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
- MEMACCESS(1)
- "ld2 {v0.b, v1.b}[7], [%1] \n"
+ // 1x8 block
+ "3: \n"
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
- MEMACCESS(2)
- "st1 {v0.d}[0], [%2] \n"
- MEMACCESS(3)
- "st1 {v1.d}[0], [%3] \n"
+ "st1 {v0.d}[0], [%2] \n"
+ "st1 {v1.d}[0], [%3] \n"
- "4: \n"
+ "4: \n"
- : "=&r"(src_temp), // %0
- "+r"(src), // %1
- "+r"(dst_a), // %2
- "+r"(dst_b), // %3
- "+r"(width64) // %4
- : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
- "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
- "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
- "r"(&kVTbl4x4TransposeDi) // %8
- : "memory", "cc",
- "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
- "v30", "v31"
- );
+ : "=&r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
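
Note on the block above: the trn1/trn2 ladders transpose 8x8 byte tiles by swapping progressively wider lanes (bytes, then halfwords, then words). As a reference for auditing that register choreography, here is a minimal scalar sketch of the same 8x8 transpose; the function name is illustrative, not a libyuv entry point:

    #include <stdint.h>

    // Scalar reference for one 8x8 tile: dst[x][y] = src[y][x].
    // The NEON loop above computes this eight columns at a time.
    static void Transpose8x8_Sketch(const uint8_t* src, int src_stride,
                                    uint8_t* dst, int dst_stride) {
      for (int y = 0; y < 8; ++y) {
        for (int x = 0; x < 8; ++x) {
          dst[x * dst_stride + y] = src[y * src_stride + x];
        }
      }
    }
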
diff --git a/files/source/rotate_win.cc b/files/source/rotate_win.cc
index 93a5c28..e887dd5 100644
--- a/files/source/rotate_win.cc
+++ b/files/source/rotate_win.cc
@@ -17,11 +17,11 @@
#endif
// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
+__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
int src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int width) {
__asm {
@@ -112,11 +112,11 @@
}
}
-__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
int src_stride,
- uint8* dst_a,
+ uint8_t* dst_a,
int dst_stride_a,
- uint8* dst_b,
+ uint8_t* dst_b,
int dst_stride_b,
int w) {
__asm {
@@ -172,7 +172,7 @@
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
- // Second round of bit swap.
+ // Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
@@ -192,8 +192,8 @@
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
- // Third round of bit swap.
- // Write to the destination pointer.
+ // Third round of bit swap.
+ // Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
diff --git a/files/source/row_any.cc b/files/source/row_any.cc
index 1092a9c..06ca723 100644
--- a/files/source/row_any.cc
+++ b/files/source/row_any.cc
@@ -31,25 +31,25 @@
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- const uint8* a_buf, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 5]); \
- memset(temp, 0, 64 * 4); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 192, a_buf + n, r); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
- yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
- SS(r, DUVSHIFT) * BPP); \
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 5]); \
+ memset(temp, 0, 64 * 4); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 192, a_buf + n, r); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
+ yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
+ SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
@@ -67,86 +67,117 @@
#undef ANY41C
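
Every ANY* wrapper in row_any.cc follows the remainder pattern visible in ANY41C above: run the SIMD kernel over the largest multiple of MASK + 1 pixels, copy the leftover pixels into zeroed aligned scratch rows, run the kernel once more on a full padded block, and copy back only the valid bytes. A minimal single-plane sketch of the pattern (names and buffer sizes illustrative):

    #include <stdint.h>
    #include <string.h>

    typedef void (*RowKernel)(const uint8_t* src, uint8_t* dst, int width);

    // SIMD on whole blocks, then one zero-padded block for the tail, so
    // the kernel itself never has to handle a partial width.
    static void AnyRow_Sketch(RowKernel kernel, const uint8_t* src,
                              uint8_t* dst, int width, int mask) {
      alignas(16) uint8_t temp[128 * 2] = {0};  // zeroed for msan
      int r = width & mask;   // leftover pixels
      int n = width & ~mask;  // largest multiple of (mask + 1)
      if (n > 0) {
        kernel(src, dst, n);
      }
      memcpy(temp, src + n, r);            // stage the tail
      kernel(temp, temp + 128, mask + 1);  // one full padded block
      memcpy(dst + n, temp + 128, r);      // keep only valid output
    }
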
// Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
}
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_MMI
+ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
+#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_MSA
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOYUY2ROW_MMI
+ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOUYVYROW_MSA
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
+#ifdef HAS_I422TOUYVYROW_MMI
+ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
+#endif
#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
+#ifdef HAS_BLENDPLANEROW_MMI
+ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
+#endif
#undef ANY31
// Note that odd width replication includes 444 formats due to the ARM
// implementation, which subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* dst_ptr, const struct YuvConstants* yuvconstants, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 4]); \
- memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n, r); \
- memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
- if (width & 1) { \
- temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
- temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
- } \
- ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
- SS(r, DUVSHIFT) * BPP); \
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
+ const uint8_t* v_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 4]); \
+ memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ if (width & 1) { \
+ temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
+ temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
+ } \
+ ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
+ MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
+ SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
+#endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
@@ -175,12 +206,6 @@
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
-#ifdef HAS_I422TOARGBROW_DSPR2
-ANY31C(I444ToARGBRow_Any_DSPR2, I444ToARGBRow_DSPR2, 0, 0, 4, 7)
-ANY31C(I422ToARGBRow_Any_DSPR2, I422ToARGBRow_DSPR2, 1, 0, 4, 7)
-ANY31C(I422ToARGB4444Row_Any_DSPR2, I422ToARGB4444Row_DSPR2, 1, 0, 2, 7)
-ANY31C(I422ToARGB1555Row_Any_DSPR2, I422ToARGB1555Row_DSPR2, 1, 0, 2, 7)
-#endif
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
@@ -192,22 +217,57 @@
#endif
#undef ANY31C
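
Two details of the reworked ANY31C are worth noting: the scratch rows grew from 64 to 128 bytes so a 32-pixel AVX2 mask (MASK = 31) still fits, and for odd widths the last subsampled chroma byte is replicated just past the tail, since r luma pixels consume SS(r, 1) = (r + 1) / 2 chroma samples. A sketch of that replication step (buffer names illustrative):

    #include <stdint.h>

    // Mirror of the "if (width & 1)" step in ANY31C: append a copy of
    // the last chroma sample one slot past the tail so the padded
    // kernel run sees replicated, not zero, chroma.
    static void ReplicateOddChroma(uint8_t* u_tmp, uint8_t* v_tmp,
                                   int r, int width) {
      int half = (r + 1) >> 1;  // SS(r, 1)
      if (width & 1) {
        u_tmp[half] = u_tmp[half - 1];
        v_tmp[half] = v_tmp[half - 1];
      }
    }
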
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
+ uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+ int width) { \
+ SIMD_ALIGNED(T temp[16 * 3]); \
+ SIMD_ALIGNED(uint8_t out[64]); \
+ memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n, r * SBPP); \
+ memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
+ ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
+ }
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#undef ANY31CT
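
ANY31CT wires up the new 10-bit paths: I210 carries 4:2:2 chroma with 10 significant bits per uint16_t sample, and AR30 packs 2-bit alpha plus three 10-bit channels into one uint32_t. A hedged sketch of the AR30 packing, assuming libyuv's 2:10:10:10 little-endian layout with blue in the low bits:

    #include <stdint.h>

    // Pack 10-bit channels into AR30. Assumed layout (little endian):
    // bits 0-9 blue, 10-19 green, 20-29 red, 30-31 alpha (3 = opaque).
    static inline uint32_t PackAR30(uint32_t b, uint32_t g, uint32_t r) {
      return b | (g << 10) | (r << 20) | (3u << 30);
    }
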
+
// Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
- int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
// Merge functions.
@@ -223,7 +283,15 @@
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
-
+#ifdef HAS_MERGEUVROW_MMI
+ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
// Math functions.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -255,12 +323,21 @@
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
+#ifdef HAS_ARGBMULTIPLYROW_MMI
+ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBADDROW_MMI
+ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSUBTRACTROW_MMI
+ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
+#endif
#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -270,6 +347,9 @@
#ifdef HAS_SOBELROW_MSA
ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELROW_MMI
+ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
+#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
@@ -279,6 +359,9 @@
#ifdef HAS_SOBELTOPLANEROW_MSA
ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
#endif
+#ifdef HAS_SOBELTOPLANEROW_MMI
+ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
+#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
@@ -288,24 +371,27 @@
#ifdef HAS_SOBELXYROW_MSA
ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#endif
+#ifdef HAS_SOBELXYROW_MMI
+ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
+#endif
#undef ANY21
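
ANY21 covers the two-source kernels above: the MergeUV family (which interleaves U and V into the biplanar NV12/NV21 chroma layout), the ARGB math rows, and the Sobel rows. The MergeUV core is plain interleaving; a scalar sketch:

    #include <stdint.h>

    // Interleave U and V planes into one UV plane (NV12 chroma order).
    // Scalar counterpart of the MergeUVRow_* kernels wrapped above.
    static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                                  uint8_t* dst_uv, int width) {
      for (int i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = src_u[i];
        dst_uv[2 * i + 1] = src_v[i];
      }
    }
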
// Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \
- const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
- } \
- memcpy(temp, y_buf + n * SBPP, r * SBPP); \
- memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
- SS(r, UVSHIFT) * SBPP2); \
- ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
+ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+ const struct YuvConstants* yuvconstants, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \
+ } \
+ memcpy(temp, y_buf + n * SBPP, r * SBPP); \
+ memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
+ SS(r, UVSHIFT) * SBPP2); \
+ ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
}
// Biplanar to RGB.
@@ -318,9 +404,6 @@
#ifdef HAS_NV12TOARGBROW_NEON
ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
#endif
-#ifdef HAS_NV12TOARGBROW_DSPR2
-ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
-#endif
#ifdef HAS_NV12TOARGBROW_MSA
ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
@@ -336,6 +419,24 @@
#ifdef HAS_NV21TOARGBROW_MSA
ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
#endif
@@ -352,8 +453,8 @@
// Any 1 to 1.
#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -381,6 +482,15 @@
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif
@@ -388,6 +498,18 @@
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
#endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
#if defined(HAS_J400TOARGBROW_SSE2)
ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
#endif
@@ -437,12 +559,24 @@
ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
#endif
+#if defined(HAS_ARGBTORGB24ROW_MMI)
+ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3)
+ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3)
+ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3)
+ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3)
+ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7)
+#endif
#if defined(HAS_RAWTORGB24ROW_NEON)
ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
#endif
#if defined(HAS_RAWTORGB24ROW_MSA)
ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
#endif
+#if defined(HAS_RAWTORGB24ROW_MMI)
+ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3)
+#endif
#ifdef HAS_ARGBTOYROW_AVX2
ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
#endif
@@ -474,57 +608,87 @@
#ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYROW_MMI
+ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBTOYJROW_MMI
+ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_BGRATOYROW_MMI
+ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MSA
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif
+#ifdef HAS_ABGRTOYROW_MMI
+ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
+#ifdef HAS_RGBATOYROW_MMI
+ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
+#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RGB24TOYROW_MMI
+ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
+#ifdef HAS_RAWTOYROW_MMI
+ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
+#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_MSA
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_RGB565TOYROW_MMI
+ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif
+#ifdef HAS_ARGB1555TOYROW_MMI
+ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
+#ifdef HAS_ARGB4444TOYROW_MMI
+ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
+#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
@@ -534,66 +698,66 @@
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOYROW_MMI
+ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
+#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_UYVYTOYROW_MMI
+ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
+#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB24TOARGBROW_MSA
ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RGB24TOARGBROW_MMI
+ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
+#ifdef HAS_RAWTOARGBROW_MMI
+ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
+#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_MSA
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_RGB565TOARGBROW_MMI
+ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB1555TOARGBROW_MMI
+ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
-#ifdef HAS_RGB24TOARGBROW_DSPR2
-ANY11(RGB24ToARGBRow_Any_DSPR2, RGB24ToARGBRow_DSPR2, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RAWTOARGBROW_DSPR2
-ANY11(RAWToARGBRow_Any_DSPR2, RAWToARGBRow_DSPR2, 0, 3, 4, 7)
-#endif
-#ifdef HAS_RGB565TOARGBROW_DSPR2
-ANY11(RGB565ToARGBRow_Any_DSPR2, RGB565ToARGBRow_DSPR2, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB1555TOARGBROW_DSPR2
-ANY11(ARGB1555ToARGBRow_Any_DSPR2, ARGB1555ToARGBRow_DSPR2, 0, 2, 4, 7)
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_DSPR2
-ANY11(ARGB4444ToARGBRow_Any_DSPR2, ARGB4444ToARGBRow_DSPR2, 0, 2, 4, 7)
-#endif
-#ifdef HAS_BGRATOYROW_DSPR2
-ANY11(BGRAToYRow_Any_DSPR2, BGRAToYRow_DSPR2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ARGBTOYROW_DSPR2
-ANY11(ARGBToYRow_Any_DSPR2, ARGBToYRow_DSPR2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_ABGRTOYROW_DSPR2
-ANY11(ABGRToYRow_Any_DSPR2, ABGRToYRow_DSPR2, 0, 4, 1, 7)
-#endif
-#ifdef HAS_RGBATOYROW_DSPR2
-ANY11(RGBAToYRow_Any_DSPR2, RGBAToYRow_DSPR2, 0, 4, 1, 7)
-#endif
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
+#ifdef HAS_ARGB4444TOARGBROW_MMI
+ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
+#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
@@ -612,21 +776,30 @@
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBATTENUATEROW_MMI
+ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
-ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 32)
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MMI
+ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7)
+#endif
#undef ANY11
// Any 1 to 1 blended. Destination is read, modify, write.
#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -645,33 +818,39 @@
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYALPHAROW_MMI
+ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1)
+#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15)
#endif
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7)
#endif
+#ifdef HAS_ARGBCOPYYTOALPHAROW_MMI
+ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7)
+#endif
#undef ANY11B
// Any 1 to 1 with parameter.
-#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, T param, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 2]); \
- memset(temp, 0, 64); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp, temp + 64, param, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
+#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
+ memset(temp, 0, 64); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp, temp + 64, param, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
}
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
ARGBToRGB565DitherRow_SSE2,
- const uint32,
+ const uint32_t,
4,
2,
3)
@@ -679,7 +858,7 @@
#if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
ARGBToRGB565DitherRow_AVX2,
- const uint32,
+ const uint32_t,
4,
2,
7)
@@ -687,7 +866,7 @@
#if defined(HAS_ARGBTORGB565DITHERROW_NEON)
ANY11P(ARGBToRGB565DitherRow_Any_NEON,
ARGBToRGB565DitherRow_NEON,
- const uint32,
+ const uint32_t,
4,
2,
7)
@@ -695,64 +874,146 @@
#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
ANY11P(ARGBToRGB565DitherRow_Any_MSA,
ARGBToRGB565DitherRow_MSA,
- const uint32,
+ const uint32_t,
4,
2,
7)
#endif
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#if defined(HAS_ARGBTORGB565DITHERROW_MMI)
+ANY11P(ARGBToRGB565DitherRow_Any_MMI,
+ ARGBToRGB565DitherRow_MMI,
+ const uint32_t,
+ 4,
+ 2,
+ 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
#endif
#ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_MSA
-ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
#endif
+#ifdef HAS_ARGBSHUFFLEROW_MMI
+ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1)
+#endif
+#undef ANY11P
#undef ANY11P
// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
-#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
- void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
- SIMD_ALIGNED(uint16 temp[16 * 2]); \
- memset(temp, 0, 32); /* for msan */ \
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
+ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+ SIMD_ALIGNED(STYPE temp[32]); \
+ SIMD_ALIGNED(DTYPE out[32]); \
+ memset(temp, 0, 32 * SBPP); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
- ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ ANY_SIMD(src_ptr, dst_ptr, scale, n); \
} \
memcpy(temp, src_ptr + n, r * SBPP); \
- ANY_SIMD(temp, temp + 16, param, MASK + 1); \
- memcpy(dst_ptr + n, temp + 16, r * BPP); \
+ ANY_SIMD(temp, out, scale, MASK + 1); \
+ memcpy(dst_ptr + n, out, r * BPP); \
+ }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+ Convert16To8Row_SSSE3,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+ Convert16To8Row_AVX2,
+ 2,
+ 1,
+ uint16_t,
+ uint8_t,
+ 31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+ Convert8To16Row_SSE2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+ Convert8To16Row_AVX2,
+ 1,
+ 2,
+ uint8_t,
+ uint16_t,
+ 31)
+#endif
+#undef ANY11C
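
This first ANY11C block (the name is reused below for the yuvconstants variant) wraps Convert16To8 and Convert8To16, which move between 8-bit and 16-bit sample depths under a fixed-point scale. Assuming the 16.16-style scale the C kernels use, narrowing looks roughly like this; for 10-bit input a scale of 16384 maps 1023 to 255:

    #include <stdint.h>

    // Narrow 16-bit samples to 8 bits with a 16.16 fixed-point scale.
    // scale 16384 suits 10-bit data (1023 -> 255); 4096 suits 12-bit.
    static void Convert16To8Row_Sketch(const uint16_t* src, uint8_t* dst,
                                       int scale, int width) {
      for (int i = 0; i < width; ++i) {
        int v = (src[i] * scale) >> 16;
        dst[i] = (uint8_t)(v > 255 ? 255 : v);
      }
    }
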
+
+// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
+ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+ SIMD_ALIGNED(ST temp[32]); \
+ SIMD_ALIGNED(T out[32]); \
+ memset(temp, 0, SBPP * 32); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, param, n); \
+ } \
+ memcpy(temp, src_ptr + n, r * SBPP); \
+ ANY_SIMD(temp, out, param, MASK + 1); \
+ memcpy(dst_ptr + n, out, r * BPP); \
}
#ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
#endif
#ifdef HAS_HALFFLOATROW_AVX2
-ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
#endif
#ifdef HAS_HALFFLOATROW_F16C
-ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+ HalfFloat1Row_F16C,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+ HalfFloat1Row_NEON,
+ uint16_t,
+ uint16_t,
+ 2,
+ 2,
+ 7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
#endif
#undef ANY11P16
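
HalfFloatRow converts 16-bit samples to IEEE binary16 after scaling by a float. The C fallback leans on an exponent-rebias trick: multiplying a binary32 value by 2^-112 shifts its exponent field so that bits 28..13 become exactly the binary16 encoding, valid while results stay non-negative and inside half's normal range. A hedged sketch of that trick (an assumption based on the shipped row_common.cc, not a spec):

    #include <stdint.h>
    #include <string.h>

    // Float-to-half by exponent rebias: after scaling by 2**-112 the
    // binary32 exponent field equals the binary16 one, so bits 28..13
    // are the half encoding (non-negative, in-range inputs only).
    static inline uint16_t FloatToHalf_Sketch(float value) {
      float biased = value * 1.9259299444e-34f;  // 2**-112
      uint32_t bits;
      memcpy(&bits, &biased, sizeof(bits));  // dodge strict aliasing
      return (uint16_t)(bits >> 13);
    }
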
// Any 1 to 1 with yuvconstants
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 2]); \
+ SIMD_ALIGNED(uint8_t temp[128 * 2]); \
memset(temp, 0, 128); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -782,20 +1043,20 @@
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, \
- int width, int source_y_fraction) { \
- SIMD_ALIGNED(uint8 temp[64 * 3]); \
- memset(temp, 0, 64 * 2); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
- } \
- memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
- memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
- ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
- memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
+ ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 3]); \
+ memset(temp, 0, 64 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
+ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
+ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
+ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
@@ -807,18 +1068,18 @@
#ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif
-#ifdef HAS_INTERPOLATEROW_DSPR2
-ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
-#endif
#ifdef HAS_INTERPOLATEROW_MSA
ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
#endif
+#ifdef HAS_INTERPOLATEROW_MMI
+ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7)
+#endif
#undef ANY11T
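
InterpolateRow blends two source rows with an 8-bit fraction; the scalers use it to synthesize rows that fall between source lines. A scalar sketch (the exact rounding term is an assumption; kernels may differ by half an LSB):

    #include <stddef.h>
    #include <stdint.h>

    // dst = src0 * (256 - f) / 256 + src1 * f / 256, with f in [0, 256).
    // f == 0 copies row 0; f == 128 is an exact two-row average.
    static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                      ptrdiff_t src_stride, int width,
                                      int source_y_fraction) {
      int f1 = source_y_fraction;
      int f0 = 256 - f1;
      for (int i = 0; i < width; ++i) {
        dst[i] =
            (uint8_t)((src[i] * f0 + src[i + src_stride] * f1 + 128) >> 8);
      }
    }
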
// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \
- SIMD_ALIGNED(uint8 temp[64 * 2]); \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64 * 2]); \
memset(temp, 0, 64); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -842,6 +1103,9 @@
#ifdef HAS_MIRRORROW_MSA
ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#endif
+#ifdef HAS_MIRRORROW_MMI
+ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
+#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -854,49 +1118,53 @@
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
#endif
+#ifdef HAS_ARGBMIRRORROW_MMI
+ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1)
+#endif
#undef ANY11M
// Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, T v32, int width) { \
- SIMD_ALIGNED(uint8 temp[64]); \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(dst_ptr, v32, n); \
- } \
- ANY_SIMD(temp, v32, MASK + 1); \
- memcpy(dst_ptr + n * BPP, temp, r * BPP); \
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+ SIMD_ALIGNED(uint8_t temp[64]); \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(dst_ptr, v32, n); \
+ } \
+ ANY_SIMD(temp, v32, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, temp, r * BPP); \
}
#ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
#endif
#ifdef HAS_ARGBSETROW_MSA
-ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32, 4, 3)
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
#endif
#undef ANY1
// Any 1 to 2. Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
- void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 3]); \
- memset(temp, 0, 128); /* for msan */ \
- int r = width & MASK; \
- int n = width & ~MASK; \
- if (n > 0) { \
- ANY_SIMD(src_ptr, dst_u, dst_v, n); \
- } \
- memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
- ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
- memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
- memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_u, dst_v, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
+ memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \
+ memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \
}
#ifdef HAS_SPLITUVROW_SSE2
@@ -908,8 +1176,11 @@
#ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif
-#ifdef HAS_SPLITUVROW_DSPR2
-ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
+#ifdef HAS_SPLITUVROW_MMI
+ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -932,14 +1203,47 @@
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
+#ifdef HAS_YUY2TOUV422ROW_MMI
+ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
+ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
+ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
+#endif
#undef ANY12
+// Any 1 to 3. Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
+ uint8_t* dst_b, int width) { \
+ SIMD_ALIGNED(uint8_t temp[16 * 6]); \
+ memset(temp, 0, 16 * 3); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \
+ } \
+ memcpy(temp, src_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
+ memcpy(dst_r + n, temp + 16 * 3, r); \
+ memcpy(dst_g + n, temp + 16 * 4, r); \
+ memcpy(dst_b + n, temp + 16 * 5, r); \
+ }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_MMI
+ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
+#endif
+
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, int src_stride_ptr, uint8* dst_u, \
- uint8* dst_v, int width) { \
- SIMD_ALIGNED(uint8 temp[128 * 4]); \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
+ uint8_t* dst_v, int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
@@ -987,83 +1291,138 @@
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVROW_MMI
+ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ARGBTOUVJROW_MMI
+ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_BGRATOUVROW_MMI
+ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_ABGRTOUVROW_MMI
+ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
#endif
+#ifdef HAS_RGBATOUVROW_MMI
+ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
+#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RGB24TOUVROW_MMI
+ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif
+#ifdef HAS_RAWTOUVROW_MMI
+ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
+#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_RGB565TOUVROW_MSA
ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_RGB565TOUVROW_MMI
+ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_MSA
ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
#endif
+#ifdef HAS_ARGB1555TOUVROW_MMI
+ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
+#ifdef HAS_ARGB4444TOUVROW_MMI
+ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
+#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
-#ifdef HAS_BGRATOUVROW_DSPR2
-ANY12S(BGRAToUVRow_Any_DSPR2, BGRAToUVRow_DSPR2, 0, 4, 15)
-#endif
-#ifdef HAS_ABGRTOUVROW_DSPR2
-ANY12S(ABGRToUVRow_Any_DSPR2, ABGRToUVRow_DSPR2, 0, 4, 15)
-#endif
-#ifdef HAS_RGBATOUVROW_DSPR2
-ANY12S(RGBAToUVRow_Any_DSPR2, RGBAToUVRow_DSPR2, 0, 4, 15)
-#endif
-#ifdef HAS_ARGBTOUVROW_DSPR2
-ANY12S(ARGBToUVRow_Any_DSPR2, ARGBToUVRow_DSPR2, 0, 4, 15)
-#endif
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_YUY2TOUVROW_MMI
+ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
+#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
+#ifdef HAS_UYVYTOUVROW_MMI
+ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
+#endif
#undef ANY12S
+// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
+ int width) { \
+ SIMD_ALIGNED(uint8_t temp[128 * 3]); \
+ memset(temp, 0, 128 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
+ } \
+ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+ memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
+ SS(r, UVSHIFT) * BPP); \
+ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+ BPP); \
+ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \
+ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \
+ } \
+ ANY_SIMD(temp, 128, temp + 256, MASK + 1); \
+ memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \
+ }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/row_common.cc b/files/source/row_common.cc
index bf953ee..8951d00 100644
--- a/files/source/row_common.cc
+++ b/files/source/row_common.cc
@@ -10,6 +10,7 @@
#include "libyuv/row.h"
+#include <stdio.h>
#include <string.h> // For memcpy and memset.
#include "libyuv/basic_types.h"
@@ -23,59 +24,69 @@
#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+static __inline int32_t clamp0(int32_t v) {
return ((-(v) >> 31) & (v));
}
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
return (((255 - (v)) >> 31) | (v)) & 255;
}
-static __inline uint32 Clamp(int32 val) {
- int v = clamp0(val);
- return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+ return (((1023 - (v)) >> 31) | (v)) & 1023;
}
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
int m = v >> 31;
return (v + m) ^ m;
}
#else // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+static __inline int32_t clamp0(int32_t v) {
return (v < 0) ? 0 : v;
}
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
return (v > 255) ? 255 : v;
}
-static __inline uint32 Clamp(int32 val) {
- int v = clamp0(val);
- return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+ return (v > 1023) ? 1023 : v;
}
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
return (v < 0) ? -v : v;
}
#endif // USE_BRANCHLESS
+static __inline uint32_t Clamp(int32_t val) {
+ int v = clamp0(val);
+ return (uint32_t)(clamp255(v));
+}
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
+static __inline uint32_t Clamp10(int32_t val) {
+ int v = clamp0(val);
+ return (uint32_t)(clamp1023(v));
+}
+
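clamp255 above exploits arithmetic right shift: (255 - v) >> 31 is all ones exactly when v > 255, turning the select into an OR/AND. It assumes v is already non-negative, which Clamp guarantees by applying clamp0 first. A quick self-check of both halves, assuming the same signed arithmetic shift the surrounding code already relies on:

#include <assert.h>
#include <stdint.h>

static int32_t clamp0_branchless(int32_t v) {
  return (-(v) >> 31) & (v);  // -v >> 31 is all ones when v > 0
}
static int32_t clamp255_branchless(int32_t v) {
  return (((255 - v) >> 31) | v) & 255;  // valid for v >= 0 only
}

int main(void) {
  assert(clamp0_branchless(-5) == 0);
  assert(clamp0_branchless(7) == 7);
  assert(clamp255_branchless(300) == 255);
  assert(clamp255_branchless(42) == 42);
  return 0;
}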
+// Little Endian
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define WRITEWORD(p, v) *(uint32_t*)(p) = v
#else
-static inline void WRITEWORD(uint8* p, uint32 v) {
- p[0] = (uint8)(v & 255);
- p[1] = (uint8)((v >> 8) & 255);
- p[2] = (uint8)((v >> 16) & 255);
- p[3] = (uint8)((v >> 24) & 255);
+static inline void WRITEWORD(uint8_t* p, uint32_t v) {
+ p[0] = (uint8_t)(v & 255);
+ p[1] = (uint8_t)((v >> 8) & 255);
+ p[2] = (uint8_t)((v >> 16) & 255);
+ p[3] = (uint8_t)((v >> 24) & 255);
}
#endif
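On the listed little-endian targets WRITEWORD is a single 32-bit store; elsewhere it falls back to byte-by-byte stores so the packed RGB565/ARGB1555 words below always land in little-endian byte order. A small check that the two paths agree on a little-endian host (the memcmp would fail on a big-endian one, which is exactly why the fallback exists):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void WriteWordPortable(uint8_t* p, uint32_t v) {
  p[0] = (uint8_t)(v & 255);
  p[1] = (uint8_t)((v >> 8) & 255);
  p[2] = (uint8_t)((v >> 16) & 255);
  p[3] = (uint8_t)((v >> 24) & 255);
}

int main(void) {
  uint8_t direct[4];
  uint8_t portable[4];
  uint32_t v = 0x11223344;
  memcpy(direct, &v, 4);  // what the direct-store path produces
  WriteWordPortable(portable, v);
  assert(memcmp(direct, portable, 4) == 0);  // holds on little-endian hosts
  return 0;
}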
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_rgb24[0];
- uint8 g = src_rgb24[1];
- uint8 r = src_rgb24[2];
+ uint8_t b = src_rgb24[0];
+ uint8_t g = src_rgb24[1];
+ uint8_t r = src_rgb24[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
@@ -85,12 +96,12 @@
}
}
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
@@ -100,12 +111,12 @@
}
}
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 r = src_raw[0];
- uint8 g = src_raw[1];
- uint8 b = src_raw[2];
+ uint8_t r = src_raw[0];
+ uint8_t g = src_raw[1];
+ uint8_t b = src_raw[2];
dst_rgb24[0] = b;
dst_rgb24[1] = g;
dst_rgb24[2] = r;
@@ -114,12 +125,14 @@
}
}
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_rgb565[0] & 0x1f;
- uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r = src_rgb565[1] >> 3;
+ uint8_t b = src_rgb565[0] & 0x1f;
+ uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t r = src_rgb565[1] >> 3;
dst_argb[0] = (b << 3) | (b >> 2);
dst_argb[1] = (g << 2) | (g >> 4);
dst_argb[2] = (r << 3) | (r >> 2);
@@ -129,15 +142,15 @@
}
}
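The (b << 3) | (b >> 2) expansion replicates the top bits of the narrow channel into the low bits, so zero maps to zero and full scale maps to full scale exactly, which a plain shift would not achieve (0x1f << 3 is only 0xf8). A quick check of the 5-to-8-bit case used above:

#include <assert.h>
#include <stdint.h>

static uint8_t Expand5To8(uint8_t v5) {  // v5 in 0..0x1f
  return (uint8_t)((v5 << 3) | (v5 >> 2));
}

int main(void) {
  assert(Expand5To8(0x00) == 0x00);  // black stays black
  assert(Expand5To8(0x1f) == 0xff);  // full scale stays full scale
  assert(Expand5To8(0x10) == 0x84);  // 10000 -> 10000100
  return 0;
}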
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb1555[0] & 0x1f;
- uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r = (src_argb1555[1] & 0x7c) >> 2;
- uint8 a = src_argb1555[1] >> 7;
+ uint8_t b = src_argb1555[0] & 0x1f;
+ uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8_t a = src_argb1555[1] >> 7;
dst_argb[0] = (b << 3) | (b >> 2);
dst_argb[1] = (g << 3) | (g >> 2);
dst_argb[2] = (r << 3) | (r >> 2);
@@ -147,15 +160,15 @@
}
}
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb4444[0] & 0x0f;
- uint8 g = src_argb4444[0] >> 4;
- uint8 r = src_argb4444[1] & 0x0f;
- uint8 a = src_argb4444[1] >> 4;
+ uint8_t b = src_argb4444[0] & 0x0f;
+ uint8_t g = src_argb4444[0] >> 4;
+ uint8_t r = src_argb4444[1] & 0x0f;
+ uint8_t a = src_argb4444[1] >> 4;
dst_argb[0] = (b << 4) | b;
dst_argb[1] = (g << 4) | g;
dst_argb[2] = (r << 4) | r;
@@ -165,12 +178,53 @@
}
}
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb[0];
- uint8 g = src_argb[1];
- uint8 r = src_argb[2];
+ uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t b = (ar30 >> 2) & 0xff;
+ uint32_t g = (ar30 >> 12) & 0xff;
+ uint32_t r = (ar30 >> 22) & 0xff;
+ uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
+ *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
+ dst_argb += 4;
+ src_ar30 += 4;
+ }
+}
+
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t b = (ar30 >> 2) & 0xff;
+ uint32_t g = (ar30 >> 12) & 0xff;
+ uint32_t r = (ar30 >> 22) & 0xff;
+ uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
+ *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
+ dst_abgr += 4;
+ src_ar30 += 4;
+ }
+}
+
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint32_t ar30 = *(const uint32_t*)src_ar30;
+ uint32_t b = ar30 & 0x3ff;
+ uint32_t ga = ar30 & 0xc00ffc00;
+ uint32_t r = (ar30 >> 20) & 0x3ff;
+ *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
+ dst_ab30 += 4;
+ src_ar30 += 4;
+ }
+}
+
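These three AR30 helpers all assume the same little-endian layout: bits 0..9 are B, 10..19 G, 20..29 R, and 30..31 A, so taking the top 8 of each 10-bit field is a shift of 2, 12 and 22, and the 2-bit alpha is widened by multiplying with 0x55. A sketch verifying those field positions (PackAR30 is a hypothetical helper, not part of libyuv):

#include <assert.h>
#include <stdint.h>

static uint32_t PackAR30(uint32_t a2, uint32_t r10, uint32_t g10, uint32_t b10) {
  return b10 | (g10 << 10) | (r10 << 20) | (a2 << 30);
}

int main(void) {
  uint32_t p = PackAR30(3, 1023, 512, 0);
  assert(((p >> 2) & 0xff) == 0);            // top 8 bits of B
  assert(((p >> 12) & 0xff) == (512 >> 2));  // top 8 bits of G
  assert(((p >> 22) & 0xff) == 0xff);        // top 8 bits of R
  assert((p >> 30) * 0x55 == 0xff);          // 2-bit alpha -> 8 bits
  return 0;
}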
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
dst_rgb[0] = b;
dst_rgb[1] = g;
dst_rgb[2] = r;
@@ -179,12 +233,12 @@
}
}
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb[0];
- uint8 g = src_argb[1];
- uint8 r = src_argb[2];
+ uint8_t b = src_argb[0];
+ uint8_t g = src_argb[1];
+ uint8_t r = src_argb[2];
dst_rgb[0] = r;
dst_rgb[1] = g;
dst_rgb[2] = b;
@@ -193,25 +247,25 @@
}
}
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 2;
- uint8 r0 = src_argb[2] >> 3;
- uint8 b1 = src_argb[4] >> 3;
- uint8 g1 = src_argb[5] >> 2;
- uint8 r1 = src_argb[6] >> 3;
+ uint8_t b0 = src_argb[0] >> 3;
+ uint8_t g0 = src_argb[1] >> 2;
+ uint8_t r0 = src_argb[2] >> 3;
+ uint8_t b1 = src_argb[4] >> 3;
+ uint8_t g1 = src_argb[5] >> 2;
+ uint8_t r1 = src_argb[6] >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 2;
- uint8 r0 = src_argb[2] >> 3;
- *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ uint8_t b0 = src_argb[0] >> 3;
+ uint8_t g0 = src_argb[1] >> 2;
+ uint8_t r0 = src_argb[2] >> 3;
+ *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
}
}
@@ -223,20 +277,20 @@
// endian will not affect the order of the original matrix. But the dither4
// will contain the first pixel in the lower byte for little endian
// or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
int dither0 = ((const unsigned char*)(&dither4))[x & 3];
int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
- uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
- uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
- uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
- uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
+ uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+ uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
+ uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
+ uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
(r1 << 27));
dst_rgb += 4;
@@ -244,112 +298,138 @@
}
if (width & 1) {
int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
- uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
- uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
- uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
- *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+ uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+ uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+ *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
}
}
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 3;
- uint8 r0 = src_argb[2] >> 3;
- uint8 a0 = src_argb[3] >> 7;
- uint8 b1 = src_argb[4] >> 3;
- uint8 g1 = src_argb[5] >> 3;
- uint8 r1 = src_argb[6] >> 3;
- uint8 a1 = src_argb[7] >> 7;
- *(uint32*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
- (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ uint8_t b0 = src_argb[0] >> 3;
+ uint8_t g0 = src_argb[1] >> 3;
+ uint8_t r0 = src_argb[2] >> 3;
+ uint8_t a0 = src_argb[3] >> 7;
+ uint8_t b1 = src_argb[4] >> 3;
+ uint8_t g1 = src_argb[5] >> 3;
+ uint8_t r1 = src_argb[6] >> 3;
+ uint8_t a1 = src_argb[7] >> 7;
+ *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
- uint8 b0 = src_argb[0] >> 3;
- uint8 g0 = src_argb[1] >> 3;
- uint8 r0 = src_argb[2] >> 3;
- uint8 a0 = src_argb[3] >> 7;
- *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ uint8_t b0 = src_argb[0] >> 3;
+ uint8_t g0 = src_argb[1] >> 3;
+ uint8_t r0 = src_argb[2] >> 3;
+ uint8_t a0 = src_argb[3] >> 7;
+ *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
}
}
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb[0] >> 4;
- uint8 g0 = src_argb[1] >> 4;
- uint8 r0 = src_argb[2] >> 4;
- uint8 a0 = src_argb[3] >> 4;
- uint8 b1 = src_argb[4] >> 4;
- uint8 g1 = src_argb[5] >> 4;
- uint8 r1 = src_argb[6] >> 4;
- uint8 a1 = src_argb[7] >> 4;
- *(uint32*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | (b1 << 16) |
- (g1 << 20) | (r1 << 24) | (a1 << 28);
+ uint8_t b0 = src_argb[0] >> 4;
+ uint8_t g0 = src_argb[1] >> 4;
+ uint8_t r0 = src_argb[2] >> 4;
+ uint8_t a0 = src_argb[3] >> 4;
+ uint8_t b1 = src_argb[4] >> 4;
+ uint8_t g1 = src_argb[5] >> 4;
+ uint8_t r1 = src_argb[6] >> 4;
+ uint8_t a1 = src_argb[7] >> 4;
+ *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
dst_rgb += 4;
src_argb += 8;
}
if (width & 1) {
- uint8 b0 = src_argb[0] >> 4;
- uint8 g0 = src_argb[1] >> 4;
- uint8 r0 = src_argb[2] >> 4;
- uint8 a0 = src_argb[3] >> 4;
- *(uint16*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ uint8_t b0 = src_argb[0] >> 4;
+ uint8_t g0 = src_argb[1] >> 4;
+ uint8_t r0 = src_argb[2] >> 4;
+ uint8_t a0 = src_argb[3] >> 4;
+ *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
}
}
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+ uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
+ uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+ uint32_t a0 = (src_abgr[3] >> 6);
+ *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+ dst_ar30 += 4;
+ src_abgr += 4;
+ }
+}
+
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
+ uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
+ uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
+ uint32_t a0 = (src_argb[3] >> 6);
+ *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+ dst_ar30 += 4;
+ src_argb += 4;
+ }
+}
+
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
}
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}
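These are BT.601 studio-range coefficients in 8-bit fixed point: the Y weights sum to 220 (so white lands at 235, not 255), 0x1080 folds in the +16 offset plus rounding, and 0x8080 biases U/V around 128. The endpoints, checked with the formula above:

#include <assert.h>
#include <stdint.h>

static int RGBToY_ref(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}

int main(void) {
  assert(RGBToY_ref(0, 0, 0) == 16);         // black -> studio-range min
  assert(RGBToY_ref(255, 255, 255) == 235);  // white -> studio-range max
  return 0;
}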
// ARGBToY_C and ARGBToUV_C
-#define MAKEROWY(NAME, R, G, B, BPP) \
- void NAME##ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
- src_rgb1[B + BPP]) >> \
- 2; \
- uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
- src_rgb1[G + BPP]) >> \
- 2; \
- uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
- src_rgb1[R + BPP]) >> \
- 2; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
- uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
- uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
- dst_u[0] = RGBToU(ar, ag, ab); \
- dst_v[0] = RGBToV(ar, ag, ab); \
- } \
+#define MAKEROWY(NAME, R, G, B, BPP) \
+ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
+ src_rgb1[B + BPP]) >> \
+ 2; \
+ uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
+ src_rgb1[G + BPP]) >> \
+ 2; \
+ uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
+ src_rgb1[R + BPP]) >> \
+ 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
}
MAKEROWY(ARGB, 2, 1, 0, 4)
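MAKEROWY(ARGB, 2, 1, 0, 4) stamps out ARGBToYRow_C and ARGBToUVRow_C with the channel offsets baked in (R at byte 2, G at 1, B at 0, 4 bytes per pixel); the UV half averages a 2x2 block before converting, which is the 4:2:0 subsample. For reference, the Y half expands to the equivalent of the following, using the RGBToY defined above (the _expanded name is only for illustration):

void ARGBToYRow_C_expanded(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // src_argb0[R=2], [G=1], [B=0] per the macro arguments.
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;  // BPP
    dst_y += 1;
  }
}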
@@ -385,65 +465,65 @@
// g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7;
}
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
}
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToYJ_C and ARGBToUVJ_C
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
- void NAME##ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
- src_argb0 += BPP; \
- dst_y += 1; \
- } \
- } \
- void NAME##ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
- uint8* dst_u, uint8* dst_v, int width) { \
- const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
- int x; \
- for (x = 0; x < width - 1; x += 2) { \
- uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
- AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
- uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
- AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
- uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
- AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- src_rgb0 += BPP * 2; \
- src_rgb1 += BPP * 2; \
- dst_u += 1; \
- dst_v += 1; \
- } \
- if (width & 1) { \
- uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
- uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
- uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
- dst_u[0] = RGBToUJ(ar, ag, ab); \
- dst_v[0] = RGBToVJ(ar, ag, ab); \
- } \
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+ } \
+ void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
+ uint8_t* dst_u, uint8_t* dst_v, int width) { \
+ const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ int x; \
+ for (x = 0; x < width - 1; x += 2) { \
+ uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
}
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
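Unlike the studio-range RGBToY above, the J (JPEG) variants are full range: the luma weights sum to 128 in 7-bit fixed point and the +64 bias is rounding only, so black maps to 0 and white to 255. Checked against the formula:

#include <assert.h>
#include <stdint.h>

static int RGBToYJ_ref(uint8_t r, uint8_t g, uint8_t b) {
  return (38 * r + 75 * g + 15 * b + 64) >> 7;
}

int main(void) {
  assert(RGBToYJ_ref(0, 0, 0) == 0);          // black -> 0, not 16
  assert(RGBToYJ_ref(255, 255, 255) == 255);  // white -> 255, not 235
  return 0;
}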
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_rgb565[0] & 0x1f;
- uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r = src_rgb565[1] >> 3;
+ uint8_t b = src_rgb565[0] & 0x1f;
+ uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t r = src_rgb565[1] >> 3;
b = (b << 3) | (b >> 2);
g = (g << 2) | (g >> 4);
r = (r << 3) | (r >> 2);
@@ -453,12 +533,12 @@
}
}
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb1555[0] & 0x1f;
- uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8_t b = src_argb1555[0] & 0x1f;
+ uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
b = (b << 3) | (b >> 2);
g = (g << 3) | (g >> 2);
r = (r << 3) | (r >> 2);
@@ -468,12 +548,12 @@
}
}
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 b = src_argb4444[0] & 0x0f;
- uint8 g = src_argb4444[0] >> 4;
- uint8 r = src_argb4444[1] & 0x0f;
+ uint8_t b = src_argb4444[0] & 0x0f;
+ uint8_t g = src_argb4444[0] >> 4;
+ uint8_t r = src_argb4444[1] & 0x0f;
b = (b << 4) | b;
g = (g << 4) | g;
r = (r << 4) | r;
@@ -483,29 +563,29 @@
}
}
-void RGB565ToUVRow_C(const uint8* src_rgb565,
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_rgb565[0] & 0x1f;
- uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r0 = src_rgb565[1] >> 3;
- uint8 b1 = src_rgb565[2] & 0x1f;
- uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
- uint8 r1 = src_rgb565[3] >> 3;
- uint8 b2 = next_rgb565[0] & 0x1f;
- uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8 r2 = next_rgb565[1] >> 3;
- uint8 b3 = next_rgb565[2] & 0x1f;
- uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
- uint8 r3 = next_rgb565[3] >> 3;
- uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
+ uint8_t b0 = src_rgb565[0] & 0x1f;
+ uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t r0 = src_rgb565[1] >> 3;
+ uint8_t b1 = src_rgb565[2] & 0x1f;
+ uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8_t r1 = src_rgb565[3] >> 3;
+ uint8_t b2 = next_rgb565[0] & 0x1f;
+ uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8_t r2 = next_rgb565[1] >> 3;
+ uint8_t b3 = next_rgb565[2] & 0x1f;
+ uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8_t r3 = next_rgb565[3] >> 3;
+ uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8_t g = (g0 + g1 + g2 + g3);
+ uint8_t r = (r0 + r1 + r2 + r3);
b = (b << 1) | (b >> 6); // 787 -> 888.
r = (r << 1) | (r >> 6);
dst_u[0] = RGBToU(r, g, b);
@@ -516,15 +596,15 @@
dst_v += 1;
}
if (width & 1) {
- uint8 b0 = src_rgb565[0] & 0x1f;
- uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
- uint8 r0 = src_rgb565[1] >> 3;
- uint8 b2 = next_rgb565[0] & 0x1f;
- uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
- uint8 r2 = next_rgb565[1] >> 3;
- uint8 b = (b0 + b2); // 565 * 2 = 676.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
+ uint8_t b0 = src_rgb565[0] & 0x1f;
+ uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8_t r0 = src_rgb565[1] >> 3;
+ uint8_t b2 = next_rgb565[0] & 0x1f;
+ uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8_t r2 = next_rgb565[1] >> 3;
+ uint8_t b = (b0 + b2); // 565 * 2 = 676.
+ uint8_t g = (g0 + g2);
+ uint8_t r = (r0 + r2);
b = (b << 2) | (b >> 4); // 676 -> 888
g = (g << 1) | (g >> 6);
r = (r << 2) | (r >> 4);
@@ -533,29 +613,29 @@
}
}
-void ARGB1555ToUVRow_C(const uint8* src_argb1555,
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb1555[0] & 0x1f;
- uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8 b1 = src_argb1555[2] & 0x1f;
- uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
- uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
- uint8 b2 = next_argb1555[0] & 0x1f;
- uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
- uint8 b3 = next_argb1555[2] & 0x1f;
- uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
- uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
- uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
+ uint8_t b0 = src_argb1555[0] & 0x1f;
+ uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8_t b1 = src_argb1555[2] & 0x1f;
+ uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+ uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
+ uint8_t b2 = next_argb1555[0] & 0x1f;
+ uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8_t b3 = next_argb1555[2] & 0x1f;
+ uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+ uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
+ uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
+ uint8_t g = (g0 + g1 + g2 + g3);
+ uint8_t r = (r0 + r1 + r2 + r3);
b = (b << 1) | (b >> 6); // 777 -> 888.
g = (g << 1) | (g >> 6);
r = (r << 1) | (r >> 6);
@@ -567,15 +647,15 @@
dst_v += 1;
}
if (width & 1) {
- uint8 b0 = src_argb1555[0] & 0x1f;
- uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
- uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
- uint8 b2 = next_argb1555[0] & 0x1f;
- uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
- uint8 r2 = next_argb1555[1] >> 3;
- uint8 b = (b0 + b2); // 555 * 2 = 666.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
+ uint8_t b0 = src_argb1555[0] & 0x1f;
+ uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8_t b2 = next_argb1555[0] & 0x1f;
+ uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8_t r2 = next_argb1555[1] >> 3;
+ uint8_t b = (b0 + b2); // 555 * 2 = 666.
+ uint8_t g = (g0 + g2);
+ uint8_t r = (r0 + r2);
b = (b << 2) | (b >> 4); // 666 -> 888.
g = (g << 2) | (g >> 4);
r = (r << 2) | (r >> 4);
@@ -584,29 +664,29 @@
}
}
-void ARGB4444ToUVRow_C(const uint8* src_argb4444,
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 b0 = src_argb4444[0] & 0x0f;
- uint8 g0 = src_argb4444[0] >> 4;
- uint8 r0 = src_argb4444[1] & 0x0f;
- uint8 b1 = src_argb4444[2] & 0x0f;
- uint8 g1 = src_argb4444[2] >> 4;
- uint8 r1 = src_argb4444[3] & 0x0f;
- uint8 b2 = next_argb4444[0] & 0x0f;
- uint8 g2 = next_argb4444[0] >> 4;
- uint8 r2 = next_argb4444[1] & 0x0f;
- uint8 b3 = next_argb4444[2] & 0x0f;
- uint8 g3 = next_argb4444[2] >> 4;
- uint8 r3 = next_argb4444[3] & 0x0f;
- uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
- uint8 g = (g0 + g1 + g2 + g3);
- uint8 r = (r0 + r1 + r2 + r3);
+ uint8_t b0 = src_argb4444[0] & 0x0f;
+ uint8_t g0 = src_argb4444[0] >> 4;
+ uint8_t r0 = src_argb4444[1] & 0x0f;
+ uint8_t b1 = src_argb4444[2] & 0x0f;
+ uint8_t g1 = src_argb4444[2] >> 4;
+ uint8_t r1 = src_argb4444[3] & 0x0f;
+ uint8_t b2 = next_argb4444[0] & 0x0f;
+ uint8_t g2 = next_argb4444[0] >> 4;
+ uint8_t r2 = next_argb4444[1] & 0x0f;
+ uint8_t b3 = next_argb4444[2] & 0x0f;
+ uint8_t g3 = next_argb4444[2] >> 4;
+ uint8_t r3 = next_argb4444[3] & 0x0f;
+ uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8_t g = (g0 + g1 + g2 + g3);
+ uint8_t r = (r0 + r1 + r2 + r3);
b = (b << 2) | (b >> 4); // 666 -> 888.
g = (g << 2) | (g >> 4);
r = (r << 2) | (r >> 4);
@@ -618,15 +698,15 @@
dst_v += 1;
}
if (width & 1) {
- uint8 b0 = src_argb4444[0] & 0x0f;
- uint8 g0 = src_argb4444[0] >> 4;
- uint8 r0 = src_argb4444[1] & 0x0f;
- uint8 b2 = next_argb4444[0] & 0x0f;
- uint8 g2 = next_argb4444[0] >> 4;
- uint8 r2 = next_argb4444[1] & 0x0f;
- uint8 b = (b0 + b2); // 444 * 2 = 555.
- uint8 g = (g0 + g2);
- uint8 r = (r0 + r2);
+ uint8_t b0 = src_argb4444[0] & 0x0f;
+ uint8_t g0 = src_argb4444[0] >> 4;
+ uint8_t r0 = src_argb4444[1] & 0x0f;
+ uint8_t b2 = next_argb4444[0] & 0x0f;
+ uint8_t g2 = next_argb4444[0] >> 4;
+ uint8_t r2 = next_argb4444[1] & 0x0f;
+ uint8_t b = (b0 + b2); // 444 * 2 = 555.
+ uint8_t g = (g0 + g2);
+ uint8_t r = (r0 + r2);
b = (b << 3) | (b >> 2); // 555 -> 888.
g = (g << 3) | (g >> 2);
r = (r << 3) | (r >> 2);
@@ -635,15 +715,15 @@
}
}
-void ARGBToUV444Row_C(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 ab = src_argb[0];
- uint8 ag = src_argb[1];
- uint8 ar = src_argb[2];
+ uint8_t ab = src_argb[0];
+ uint8_t ag = src_argb[1];
+ uint8_t ar = src_argb[2];
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb += 4;
@@ -652,10 +732,10 @@
}
}
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
- uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+ uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = src_argb[3];
dst_argb += 4;
@@ -664,7 +744,7 @@
}
// Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
int b = dst_argb[0];
@@ -683,9 +763,9 @@
// Apply color matrix to a row of image. Matrix is signed.
// TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -715,7 +795,9 @@
}
// Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
int b = dst_argb[0];
@@ -731,7 +813,9 @@
}
// Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_C(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
int x;
for (x = 0; x < width; ++x) {
int b = dst_argb[0];
@@ -744,7 +828,7 @@
}
}
-void ARGBQuantizeRow_C(uint8* dst_argb,
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
@@ -764,21 +848,21 @@
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 24
-void ARGBShadeRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
- const uint32 b_scale = REPEAT8(value & 0xff);
- const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
- const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
- const uint32 a_scale = REPEAT8(value >> 24);
+ uint32_t value) {
+ const uint32_t b_scale = REPEAT8(value & 0xff);
+ const uint32_t g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32_t r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32_t a_scale = REPEAT8(value >> 24);
int i;
for (i = 0; i < width; ++i) {
- const uint32 b = REPEAT8(src_argb[0]);
- const uint32 g = REPEAT8(src_argb[1]);
- const uint32 r = REPEAT8(src_argb[2]);
- const uint32 a = REPEAT8(src_argb[3]);
+ const uint32_t b = REPEAT8(src_argb[0]);
+ const uint32_t g = REPEAT8(src_argb[1]);
+ const uint32_t r = REPEAT8(src_argb[2]);
+ const uint32_t a = REPEAT8(src_argb[3]);
dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
@@ -793,20 +877,20 @@
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 16
-void ARGBMultiplyRow_C(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
- const uint32 b = REPEAT8(src_argb0[0]);
- const uint32 g = REPEAT8(src_argb0[1]);
- const uint32 r = REPEAT8(src_argb0[2]);
- const uint32 a = REPEAT8(src_argb0[3]);
- const uint32 b_scale = src_argb1[0];
- const uint32 g_scale = src_argb1[1];
- const uint32 r_scale = src_argb1[2];
- const uint32 a_scale = src_argb1[3];
+ const uint32_t b = REPEAT8(src_argb0[0]);
+ const uint32_t g = REPEAT8(src_argb0[1]);
+ const uint32_t r = REPEAT8(src_argb0[2]);
+ const uint32_t a = REPEAT8(src_argb0[3]);
+ const uint32_t b_scale = src_argb1[0];
+ const uint32_t g_scale = src_argb1[1];
+ const uint32_t r_scale = src_argb1[2];
+ const uint32_t a_scale = src_argb1[3];
dst_argb[0] = SHADE(b, b_scale);
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
@@ -821,9 +905,9 @@
#define SHADE(f, v) clamp255(v + f)
-void ARGBAddRow_C(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -848,9 +932,9 @@
#define SHADE(f, v) clamp0(f - v)
-void ARGBSubtractRow_C(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -874,10 +958,10 @@
#undef SHADE
// Sobel functions which mimics SSSE3.
-void SobelXRow_C(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -891,13 +975,13 @@
int b_diff = b - b_sub;
int c_diff = c - c_sub;
int sobel = Abs(a_diff + b_diff * 2 + c_diff);
- dst_sobelx[i] = (uint8)(clamp255(sobel));
+ dst_sobelx[i] = (uint8_t)(clamp255(sobel));
}
}
-void SobelYRow_C(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_C(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
int i;
for (i = 0; i < width; ++i) {
@@ -911,62 +995,62 @@
int b_diff = b - b_sub;
int c_diff = c - c_sub;
int sobel = Abs(a_diff + b_diff * 2 + c_diff);
- dst_sobely[i] = (uint8)(clamp255(sobel));
+ dst_sobely[i] = (uint8_t)(clamp255(sobel));
}
}
-void SobelRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
int b = src_sobely[i];
int s = clamp255(r + b);
- dst_argb[0] = (uint8)(s);
- dst_argb[1] = (uint8)(s);
- dst_argb[2] = (uint8)(s);
- dst_argb[3] = (uint8)(255u);
+ dst_argb[0] = (uint8_t)(s);
+ dst_argb[1] = (uint8_t)(s);
+ dst_argb[2] = (uint8_t)(s);
+ dst_argb[3] = (uint8_t)(255u);
dst_argb += 4;
}
}
-void SobelToPlaneRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
int b = src_sobely[i];
int s = clamp255(r + b);
- dst_y[i] = (uint8)(s);
+ dst_y[i] = (uint8_t)(s);
}
}
-void SobelXYRow_C(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_C(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
int r = src_sobelx[i];
int b = src_sobely[i];
int g = clamp255(r + b);
- dst_argb[0] = (uint8)(b);
- dst_argb[1] = (uint8)(g);
- dst_argb[2] = (uint8)(r);
- dst_argb[3] = (uint8)(255u);
+ dst_argb[0] = (uint8_t)(b);
+ dst_argb[1] = (uint8_t)(g);
+ dst_argb[2] = (uint8_t)(r);
+ dst_argb[3] = (uint8_t)(255u);
dst_argb += 4;
}
}
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// Copy a Y to RGB.
int x;
for (x = 0; x < width; ++x) {
- uint8 y = src_y[0];
+ uint8_t y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
dst_argb += 4;
@@ -1223,12 +1307,14 @@
#undef YG
// C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y,
- uint8 u,
- uint8 v,
- uint8* b,
- uint8* g,
- uint8* r,
+// Reads 8 bit YUV and writes 8 bit clamped RGB.
+
+static __inline void YuvPixel(uint8_t y,
+ uint8_t u,
+ uint8_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
int ub = -yuvconstants->kUVToRB[0];
@@ -1259,10 +1345,117 @@
int yg = yuvconstants->kYToRgb[0];
#endif
- uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
- *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
- *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
- *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
+ *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
+ *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
+}
+
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel8_16(uint8_t y,
+ uint8_t u,
+ uint8_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = -yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+ int ub = yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+
+ uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+ *b = (int)(-(u * ub) + y1 + bb);
+ *g = (int)(-(u * ug + v * vg) + y1 + bg);
+ *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 10 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16(int16_t y,
+ int16_t u,
+ int16_t v,
+ int* b,
+ int* g,
+ int* r,
+ const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = -yuvconstants->kUVToRB[1];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+ int ub = -yuvconstants->kUVToRB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[4];
+ int vr = -yuvconstants->kUVToRB[4];
+ int bb = yuvconstants->kUVBiasBGR[0];
+ int bg = yuvconstants->kUVBiasBGR[1];
+ int br = yuvconstants->kUVBiasBGR[2];
+ int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+ int ub = yuvconstants->kUVToB[0];
+ int ug = yuvconstants->kUVToG[0];
+ int vg = yuvconstants->kUVToG[1];
+ int vr = yuvconstants->kUVToR[1];
+ int bb = yuvconstants->kUVBiasB[0];
+ int bg = yuvconstants->kUVBiasG[0];
+ int br = yuvconstants->kUVBiasR[0];
+ int yg = yuvconstants->kYToRgb[0];
+#endif
+
+ uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+ u = clamp255(u >> 2);
+ v = clamp255(v >> 2);
+ *b = (int)(-(u * ub) + y1 + bb);
+ *g = (int)(-(u * ug + v * vg) + y1 + bg);
+ *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 10 bit assembly.
+// Reads 10 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel10(uint16_t y,
+ uint16_t u,
+ uint16_t v,
+ uint8_t* b,
+ uint8_t* g,
+ uint8_t* r,
+ const struct YuvConstants* yuvconstants) {
+ int b16;
+ int g16;
+ int r16;
+ YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+ *b = Clamp(b16 >> 6);
+ *g = Clamp(g16 >> 6);
+ *r = Clamp(r16 >> 6);
}
// Y contribution to R,G,B. Scale and bias.
@@ -1270,11 +1463,11 @@
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
- uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
- *b = Clamp((int32)(y1 + YGB) >> 6);
- *g = Clamp((int32)(y1 + YGB) >> 6);
- *r = Clamp((int32)(y1 + YGB) >> 6);
+static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
+ uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32_t)(y1 + YGB) >> 6);
+ *g = Clamp((int32_t)(y1 + YGB) >> 6);
+ *r = Clamp((int32_t)(y1 + YGB) >> 6);
}
#undef YG
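With the upstream values YG = 18997 (defined in a hunk elided above) and YGB = -1160, YPixel scales studio-range Y to full range: y * 0x0101 widens 8 bits to 16, the multiply-and-shift applies the 1.164 gain, and the final >> 6 drops the fixed-point fraction. A standalone re-derivation of the endpoints, assuming those constants:

#include <assert.h>
#include <stdint.h>

static uint8_t YPixelGray(uint8_t y) {
  uint32_t y1 = (uint32_t)(y * 0x0101 * 18997) >> 16;  // YG
  int32_t v = ((int32_t)y1 - 1160) >> 6;               // + YGB, then >> 6
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // Clamp
}

int main(void) {
  assert(YPixelGray(16) == 0);     // studio black -> full-range 0
  assert(YPixelGray(235) == 255);  // studio white -> full-range 255
  return 0;
}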
@@ -1284,16 +1477,16 @@
(defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
// TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
- uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+ uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
+ uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
yuvconstants);
rgb_buf[3] = 255;
@@ -1312,10 +1505,10 @@
}
}
#else
-void I444ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1332,10 +1525,10 @@
#endif
// Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1358,11 +1551,105 @@
}
}
-void I422AlphaToARGBRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* rgb_buf,
+// 10 bit YUV to ARGB
+void I210ToARGBRow_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+ rgb_buf + 6, yuvconstants);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ rgb_buf[3] = 255;
+ }
+}
+
+static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
+ uint32_t ar30;
+ b = b >> 4; // convert 10.6 to 10 bit.
+ g = g >> 4;
+ r = r >> 4;
+ b = Clamp10(b);
+ g = Clamp10(g);
+ r = Clamp10(r);
+ ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
+ (*(uint32_t*)rgb_buf) = ar30;
+}
+
+// 10 bit YUV to 10 bit AR30
+void I210ToAR30Row_C(const uint16_t* src_y,
+ const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
+// 8 bit YUV to 10 bit AR30
+// Uses the same code as 10 bit YUV but shifts the 8 bit values up to 10 bits.
+void I422ToAR30Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ int b;
+ int g;
+ int r;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf + 4, b, g, r);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+ StoreAR30(rgb_buf, b, g, r);
+ }
+}
+
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1386,10 +1673,10 @@
}
}
-void I422ToRGB24Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1409,18 +1696,18 @@
}
}
-void I422ToARGB4444Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
+ uint8_t b0;
+ uint8_t g0;
+ uint8_t r0;
+ uint8_t b1;
+ uint8_t g1;
+ uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1431,8 +1718,8 @@
b1 = b1 >> 4;
g1 = g1 >> 4;
r1 = r1 >> 4;
- *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
- (g1 << 20) | (r1 << 24) | 0xf000f000;
+ *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+ (g1 << 20) | (r1 << 24) | 0xf000f000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1443,22 +1730,22 @@
b0 = b0 >> 4;
g0 = g0 >> 4;
r0 = r0 >> 4;
- *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
+ *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
}
}
-void I422ToARGB1555Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
+ uint8_t b0;
+ uint8_t g0;
+ uint8_t r0;
+ uint8_t b1;
+ uint8_t g1;
+ uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1469,8 +1756,8 @@
b1 = b1 >> 3;
g1 = g1 >> 3;
r1 = r1 >> 3;
- *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
- (g1 << 21) | (r1 << 26) | 0x80008000;
+ *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+ (g1 << 21) | (r1 << 26) | 0x80008000;
src_y += 2;
src_u += 1;
src_v += 1;
@@ -1481,22 +1768,22 @@
b0 = b0 >> 3;
g0 = g0 >> 3;
r0 = r0 >> 3;
- *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
+ *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
}
}
-void I422ToRGB565Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
+ uint8_t b0;
+ uint8_t g0;
+ uint8_t r0;
+ uint8_t b1;
+ uint8_t g1;
+ uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1507,7 +1794,7 @@
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) =
+ *(uint32_t*)(dst_rgb565) =
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2;
src_u += 1;
@@ -1519,13 +1806,13 @@
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
}
}
-void NV12ToARGBRow_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1547,9 +1834,9 @@
}
}
-void NV21ToARGBRow_C(const uint8* src_y,
- const uint8* src_vu,
- uint8* rgb_buf,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1571,17 +1858,59 @@
}
}
-void NV12ToRGB565Row_C(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
+ src_y += 2;
+ src_uv += 2;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ }
+}
+
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* rgb_buf,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
+ rgb_buf + 5, yuvconstants);
+ src_y += 2;
+ src_vu += 2;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+ rgb_buf + 2, yuvconstants);
+ }
+}
+
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- uint8 b0;
- uint8 g0;
- uint8 r0;
- uint8 b1;
- uint8 g1;
- uint8 r1;
+ uint8_t b0;
+ uint8_t g0;
+ uint8_t r0;
+ uint8_t b1;
+ uint8_t g1;
+ uint8_t r1;
int x;
for (x = 0; x < width - 1; x += 2) {
YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
@@ -1592,7 +1921,7 @@
b1 = b1 >> 3;
g1 = g1 >> 2;
r1 = r1 >> 3;
- *(uint32*)(dst_rgb565) =
+ *(uint32_t*)(dst_rgb565) =
b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
src_y += 2;
src_uv += 2;
@@ -1603,12 +1932,12 @@
b0 = b0 >> 3;
g0 = g0 >> 2;
r0 = r0 >> 3;
- *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
}
}
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
- uint8* rgb_buf,
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1629,8 +1958,8 @@
}
}
-void UYVYToARGBRow_C(const uint8* src_uyvy,
- uint8* rgb_buf,
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1651,10 +1980,10 @@
}
}
-void I422ToRGBARow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToRGBARow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -1677,7 +2006,7 @@
}
}
-void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@@ -1693,7 +2022,7 @@
}
}
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
int x;
src += width - 1;
for (x = 0; x < width - 1; x += 2) {
@@ -1706,7 +2035,10 @@
}
}
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width - 1; x += 2) {
@@ -1722,10 +2054,10 @@
}
}
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
int x;
- const uint32* src32 = (const uint32*)(src);
- uint32* dst32 = (uint32*)(dst);
+ const uint32_t* src32 = (const uint32_t*)(src);
+ uint32_t* dst32 = (uint32_t*)(dst);
src32 += width - 1;
for (x = 0; x < width - 1; x += 2) {
dst32[x] = src32[0];
@@ -1737,7 +2069,10 @@
}
}
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void SplitUVRow_C(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
@@ -1752,9 +2087,9 @@
}
}
-void MergeUVRow_C(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_C(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -1770,20 +2105,110 @@
}
}
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
+void SplitRGBRow_C(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_r[x] = src_rgb[0];
+ dst_g[x] = src_rgb[1];
+ dst_b[x] = src_rgb[2];
+ src_rgb += 3;
+ }
+}
+
+void MergeRGBRow_C(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_rgb[0] = src_r[x];
+ dst_rgb[1] = src_g[x];
+ dst_rgb[2] = src_b[x];
+ dst_rgb += 3;
+ }
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+void MergeUVRow_16_C(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = src_u[x] * scale;
+ dst_uv[1] = src_v[x] * scale;
+ dst_uv[2] = src_u[x + 1] * scale;
+ dst_uv[3] = src_v[x + 1] * scale;
+ dst_uv += 4;
+ }
+ if (width & 1) {
+ dst_uv[0] = src_u[width - 1] * scale;
+ dst_uv[1] = src_v[width - 1] * scale;
+ }
+}
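+
Each factor in the comment is 1 << (16 - bits): multiplying an lsb-aligned sample by it left-justifies the value in the 16-bit word. A sketch of the 10-bit case (hypothetical data, not part of the diff):

#include <stdint.h>
static void Merge10BitUVExample(void) {
  uint16_t u[2] = {0, 1023};  // 10-bit lsb-aligned chroma samples
  uint16_t v[2] = {512, 512};
  uint16_t uv[4];
  MergeUVRow_16_C(u, v, uv, 64, 2);  // 64 == 1 << (16 - 10)
  // uv[2] == 1023 * 64 == 0xFFC0: the 10-bit maximum, msb-aligned.
}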
+
+void MultiplyRow_16_C(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = src_y[x] * scale;
+ }
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_C(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+ }
+}
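+
These factors are 1 << (24 - bits), so (src * scale) >> 16 reduces to src >> (bits - 8) with saturation. Checking the 10-bit case by hand (a sketch, not in the diff):

#include <stdint.h>
static void Convert10To8Example(void) {
  uint16_t src[2] = {1023, 512};  // 10-bit samples
  uint8_t dst[2];
  Convert16To8Row_C(src, dst, 16384, 2);  // 16384 == 1 << (24 - 10)
  // dst[0] == (1023 * 16384) >> 16 == 255; dst[1] == 128.
}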
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ int x;
+ scale *= 0x0101; // replicates the byte.
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = (src_y[x] * scale) >> 16;
+ }
+}
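+
The 0x0101 multiply replicates the byte (v * 0x0101 == (v << 8) | v) before the shift, so 8-bit full scale maps to full scale of the wider format instead of leaving the low bits zero. For 10-bit output (sketch, not part of the diff):

#include <stdint.h>
static void Convert8To10Example(void) {
  uint8_t src[1] = {255};
  uint16_t dst[1];
  Convert8To16Row_C(src, dst, 1024, 1);  // 1024 == 1 << 10
  // Internally scale becomes 1024 * 0x0101, so
  // dst[0] == (255 * 1024 * 0x0101) >> 16 == 1023 (a plain shift gives 1020).
}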
+
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
memcpy(dst, src, count);
}
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
memcpy(dst, src, count * 2);
}
-void SetRow_C(uint8* dst, uint8 v8, int width) {
+void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
memset(dst, v8, width);
}
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
- uint32* d = (uint32*)(dst_argb);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
+ uint32_t* d = (uint32_t*)(dst_argb);
int x;
for (x = 0; x < width; ++x) {
d[x] = v32;
@@ -1791,10 +2216,10 @@
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2,
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
// Output a row of UV values, filtering 2 rows of YUY2.
int x;
@@ -1808,9 +2233,9 @@
}
// Copy row of YUY2 UV's (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
// Output a row of UV values.
int x;
@@ -1824,7 +2249,7 @@
}
// Copy row of YUY2 Y's (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
// Output a row of Y values.
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -1838,10 +2263,10 @@
}
// Filter 2 rows of UYVY UV's (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy,
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
// Output a row of UV values.
int x;
@@ -1855,9 +2280,9 @@
}
// Copy row of UYVY UV's (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
// Output a row of UV values.
int x;
@@ -1871,7 +2296,7 @@
}
// Copy row of UYVY Y's (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
// Output a row of Y values.
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -1889,19 +2314,19 @@
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
// This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- uint32 fb = src_argb0[0];
- uint32 fg = src_argb0[1];
- uint32 fr = src_argb0[2];
- uint32 a = src_argb0[3];
- uint32 bb = src_argb1[0];
- uint32 bg = src_argb1[1];
- uint32 br = src_argb1[2];
+ uint32_t fb = src_argb0[0];
+ uint32_t fg = src_argb0[1];
+ uint32_t fr = src_argb0[2];
+ uint32_t a = src_argb0[3];
+ uint32_t bb = src_argb1[0];
+ uint32_t bg = src_argb1[1];
+ uint32_t br = src_argb1[2];
dst_argb[0] = BLEND(fb, bb, a);
dst_argb[1] = BLEND(fg, bg, a);
dst_argb[2] = BLEND(fr, br, a);
@@ -1924,13 +2349,13 @@
}
if (width & 1) {
- uint32 fb = src_argb0[0];
- uint32 fg = src_argb0[1];
- uint32 fr = src_argb0[2];
- uint32 a = src_argb0[3];
- uint32 bb = src_argb1[0];
- uint32 bg = src_argb1[1];
- uint32 br = src_argb1[2];
+ uint32_t fb = src_argb0[0];
+ uint32_t fg = src_argb0[1];
+ uint32_t fr = src_argb0[2];
+ uint32_t a = src_argb0[3];
+ uint32_t bb = src_argb1[0];
+ uint32_t bg = src_argb1[1];
+ uint32_t br = src_argb1[2];
dst_argb[0] = BLEND(fb, bb, a);
dst_argb[1] = BLEND(fg, bg, a);
dst_argb[2] = BLEND(fr, br, a);
@@ -1940,10 +2365,10 @@
#undef BLEND
#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_C(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -1964,13 +2389,13 @@
// Multiply source RGB by alpha and store to destination.
// This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
- uint32 b = src_argb[0];
- uint32 g = src_argb[1];
- uint32 r = src_argb[2];
- uint32 a = src_argb[3];
+ uint32_t b = src_argb[0];
+ uint32_t g = src_argb[1];
+ uint32_t r = src_argb[2];
+ uint32_t a = src_argb[3];
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
@@ -1988,10 +2413,10 @@
}
if (width & 1) {
- const uint32 b = src_argb[0];
- const uint32 g = src_argb[1];
- const uint32 r = src_argb[2];
- const uint32 a = src_argb[3];
+ const uint32_t b = src_argb[0];
+ const uint32_t g = src_argb[1];
+ const uint32_t r = src_argb[2];
+ const uint32_t a = src_argb[3];
dst_argb[0] = ATTENUATE(b, a);
dst_argb[1] = ATTENUATE(g, a);
dst_argb[2] = ATTENUATE(r, a);
@@ -2007,7 +2432,7 @@
// Reciprocal method is off by 1 on some values, e.g. 125.
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
#define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
+const uint32_t fixed_invtbl8[256] = {
0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06),
T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d),
T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14),
@@ -2047,14 +2472,16 @@
T(0xfc), T(0xfd), T(0xfe), 0x01000100};
#undef T
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
int i;
for (i = 0; i < width; ++i) {
- uint32 b = src_argb[0];
- uint32 g = src_argb[1];
- uint32 r = src_argb[2];
- const uint32 a = src_argb[3];
- const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
+ uint32_t b = src_argb[0];
+ uint32_t g = src_argb[1];
+ uint32_t r = src_argb[2];
+ const uint32_t a = src_argb[3];
+ const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
b = (b * ia) >> 8;
g = (g * ia) >> 8;
r = (r * ia) >> 8;
@@ -2068,11 +2495,11 @@
}
}
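The low half of each fixed_invtbl8 entry is 0x10000 / a, a reciprocal in 8.8 fixed point, so (c * ia) >> 8 works out to c * 256 / a. A one-channel sketch (simplified: it skips the special-cased a == 0 and a == 1 table entries):

#include <stdint.h>
static uint8_t UnattenuateChannel(uint8_t c, uint8_t a) {
  uint32_t ia = 0x10000 / a;   // 8.8 fixed-point 1/a, as in the T(a) macro
  uint32_t v = (c * ia) >> 8;  // == c * 256 / a
  return v > 255 ? 255 : (uint8_t)v;
}
// UnattenuateChannel(64, 128) == 128: a premultiplied 64 at half alpha
// restores to 128, matching ARGBUnattenuateRow_C above.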
-void ComputeCumulativeSumRow_C(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width) {
- int32 row_sum[4] = {0, 0, 0, 0};
+ int32_t row_sum[4] = {0, 0, 0, 0};
int x;
for (x = 0; x < width; ++x) {
row_sum[0] += row[x * 4 + 0];
@@ -2086,19 +2513,19 @@
}
}
-void CumulativeSumToAverageRow_C(const int32* tl,
- const int32* bl,
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+ const int32_t* bl,
int w,
int area,
- uint8* dst,
+ uint8_t* dst,
int count) {
float ooa = 1.0f / area;
int i;
for (i = 0; i < count; ++i) {
- dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
- dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
- dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
- dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
dst += 4;
tl += 4;
bl += 4;
@@ -2107,9 +2534,9 @@
// Copy pixels from rotated source to destination row with a slope.
LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb,
+void ARGBAffineRow_C(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width) {
int i;
@@ -2120,8 +2547,8 @@
for (i = 0; i < width; ++i) {
int x = (int)(uv[0]);
int y = (int)(uv[1]);
- *(uint32*)(dst_argb) =
- *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
+ *(uint32_t*)(dst_argb) =
+ *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
dst_argb += 4;
uv[0] += uv_dudv[2];
uv[1] += uv_dudv[3];
@@ -2129,9 +2556,9 @@
}
// Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv,
+static void HalfRow_C(const uint8_t* src_uv,
ptrdiff_t src_uv_stride,
- uint8* dst_uv,
+ uint8_t* dst_uv,
int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -2139,9 +2566,9 @@
}
}
-static void HalfRow_16_C(const uint16* src_uv,
+static void HalfRow_16_C(const uint16_t* src_uv,
ptrdiff_t src_uv_stride,
- uint16* dst_uv,
+ uint16_t* dst_uv,
int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -2150,14 +2577,14 @@
}
// C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
@@ -2182,14 +2609,14 @@
}
}
-void InterpolateRow_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
- const uint16* src_ptr1 = src_ptr + src_stride;
+ const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
if (source_y_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
@@ -2212,9 +2639,9 @@
}
// Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
int index0 = shuffler[0];
int index1 = shuffler[1];
@@ -2224,10 +2651,10 @@
int x;
for (x = 0; x < width; ++x) {
// To support in-place conversion.
- uint8 b = src_argb[index0];
- uint8 g = src_argb[index1];
- uint8 r = src_argb[index2];
- uint8 a = src_argb[index3];
+ uint8_t b = src_argb[index0];
+ uint8_t g = src_argb[index1];
+ uint8_t r = src_argb[index2];
+ uint8_t a = src_argb[index3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
@@ -2237,10 +2664,10 @@
}
}
-void I422ToYUY2Row_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+void I422ToYUY2Row_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -2261,10 +2688,10 @@
}
}
-void I422ToUYVYRow_C(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+void I422ToUYVYRow_C(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -2285,8 +2712,8 @@
}
}
-void ARGBPolynomialRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
int i;
@@ -2316,10 +2743,10 @@
dr += poly[14] * r3;
da += poly[15] * a3;
- dst_argb[0] = Clamp((int32)(db));
- dst_argb[1] = Clamp((int32)(dg));
- dst_argb[2] = Clamp((int32)(dr));
- dst_argb[3] = Clamp((int32)(da));
+ dst_argb[0] = Clamp((int32_t)(db));
+ dst_argb[1] = Clamp((int32_t)(dg));
+ dst_argb[2] = Clamp((int32_t)(dr));
+ dst_argb[3] = Clamp((int32_t)(da));
src_argb += 4;
dst_argb += 4;
}
@@ -2335,31 +2762,49 @@
// simply extract the low bits of the exponent and the high
// bits of the mantissa from our float and we're done.
-void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+// Work around the GCC 7 type-punning warning (-Wstrict-aliasing).
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+void HalfFloatRow_C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
int i;
float mult = 1.9259299444e-34f * scale;
for (i = 0; i < width; ++i) {
float value = src[i] * mult;
- dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
+ dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13);
}
}
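The magic multiplier 1.9259299444e-34f is 2^-112; it rebiases the single-precision exponent (bias 127) down to the half-precision bias (15), after which the half's exponent and mantissa end up in bits 27..13 of the float, and the >> 13 finishes the job. A one-value sketch assuming IEEE 754 floats (not part of the diff):

#include <stdint.h>
#include <string.h>
static uint16_t HalfFromU16(uint16_t src, float scale) {
  float value = src * (1.9259299444e-34f * scale);  // multiply by 2^-112
  uint32_t bits;
  memcpy(&bits, &value, sizeof(bits));  // memcpy sidesteps the aliasing UB
  return (uint16_t)(bits >> 13);
}
// HalfFromU16(1023, 1.0f / 1023.0f) == 0x3C00, the half encoding of 1.0.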
-void ARGBLumaColorTableRow_C(const uint8* src_argb,
- uint8* dst_argb,
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ float value = src[i] * scale;
+ dst[i] = value;
+ }
+}
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff) {
- uint32 bc = lumacoeff & 0xff;
- uint32 gc = (lumacoeff >> 8) & 0xff;
- uint32 rc = (lumacoeff >> 16) & 0xff;
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
+ uint32_t bc = lumacoeff & 0xff;
+ uint32_t gc = (lumacoeff >> 8) & 0xff;
+ uint32_t rc = (lumacoeff >> 16) & 0xff;
int i;
for (i = 0; i < width - 1; i += 2) {
// Luminance in rows, color values in columns.
- const uint8* luma0 =
+ const uint8_t* luma0 =
((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
luma;
- const uint8* luma1;
+ const uint8_t* luma1;
dst_argb[0] = luma0[src_argb[0]];
dst_argb[1] = luma0[src_argb[1]];
dst_argb[2] = luma0[src_argb[2]];
@@ -2376,7 +2821,7 @@
}
if (width & 1) {
// Luminance in rows, color values in columns.
- const uint8* luma0 =
+ const uint8_t* luma0 =
((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
luma;
dst_argb[0] = luma0[src_argb[0]];
@@ -2386,7 +2831,7 @@
}
}
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
dst[3] = src[3];
@@ -2399,7 +2844,7 @@
}
}
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
dst_a[0] = src_argb[3];
@@ -2412,7 +2857,7 @@
}
}
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
int i;
for (i = 0; i < width - 1; i += 2) {
dst[3] = src[0];
@@ -2431,13 +2876,13 @@
#if !(defined(_MSC_VER) && defined(_M_IX86)) && \
defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper.
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2452,14 +2897,14 @@
#endif
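Every wrapper in this block has the same shape: convert up to MAXTWIDTH pixels into an aligned intermediate ARGB row with the fast kernel, repack that row into the narrow format, advance the pointers, repeat. A generic sketch of that shape (the function-pointer indirection is mine, for illustration only; the real wrappers hard-code both kernels):

#include <stdint.h>
typedef void (*WideFn)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* argb, const struct YuvConstants* c, int n);
typedef void (*PackFn)(const uint8_t* argb, uint8_t* dst, int n);
static void I422TwoStepRow(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int dst_bpp,
                           const struct YuvConstants* c, int width,
                           WideFn wide, PackFn pack) {
  uint8_t row[MAXTWIDTH * 4];  // the real code wraps this in SIMD_ALIGNED
  while (width > 0) {
    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
    wide(y, u, v, row, c, twidth);  // fast YUV -> ARGB kernel
    pack(row, dst, twidth);         // ARGB -> narrow destination format
    y += twidth;
    u += twidth / 2;  // 4:2:2 chroma advances at half rate
    v += twidth / 2;
    dst += twidth * dst_bpp;
    width -= twidth;
  }
}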
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2474,14 +2919,14 @@
#endif
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2496,13 +2941,13 @@
#endif
#if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
@@ -2515,14 +2960,102 @@
}
#endif
-#if defined(HAS_I422TORGB565ROW_AVX2)
-void I422ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+ ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb24 += twidth * 3;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2541,14 +3074,14 @@
#endif
#if defined(HAS_I422TOARGB1555ROW_AVX2)
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2567,14 +3100,14 @@
#endif
#if defined(HAS_I422TOARGB4444ROW_AVX2)
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2593,19 +3126,22 @@
#endif
#if defined(HAS_I422TORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
- // TODO(fbarchard): ARGBToRGB24Row_AVX2
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;
@@ -2616,13 +3152,13 @@
#endif
#if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
// Row buffer for intermediate ARGB pixels.
- SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
@@ -2639,6 +3175,175 @@
}
#endif
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+ float fsum = 0.f;
+ int i;
+#if defined(__clang__)
+#pragma clang loop vectorize_width(4)
+#endif
+ for (i = 0; i < width; ++i) {
+ float v = *src++;
+ fsum += v * v;
+ *dst++ = v * scale;
+ }
+ return fsum;
+}
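+
The return value is the pre-scale sum of squares, so a caller can scale a buffer and obtain its energy (or RMS) in a single pass. Sketch, not part of the diff:

#include <math.h>
// Scale a buffer and derive the input RMS from the returned energy.
static float ScaleAndRms(const float* src, float* dst, float scale, int n) {
  float energy = ScaleSumSamples_C(src, dst, scale, n);
  return sqrtf(energy / (float)n);
}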
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+ float fmax = 0.f;
+ int i;
+ for (i = 0; i < width; ++i) {
+ float v = *src++;
+ float vs = v * scale;
+ fmax = (v > fmax) ? v : fmax;
+ *dst++ = vs;
+ }
+ return fmax;
+}
+
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src++ * scale;
+ }
+}
+
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ =
+ (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+ ++src;
+ }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_C(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+ }
+}
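+
The two passes form a separable 5-tap binomial (Gaussian) filter: the column weights 1+4+6+4+1 sum to 16, the row pass multiplies by another 16, and GaussRow_C's + 128 >> 8 divides by 256 with rounding. A quick sanity sketch (not in the diff): a flat plane passes through unchanged.

#include <stdint.h>
static void GaussFlatExample(void) {
  enum { kW = 8 };
  uint16_t flat[kW + 4];  // GaussRow_C reads 4 taps past each output
  uint32_t col[kW + 4];
  uint16_t out[kW];
  int i;
  for (i = 0; i < kW + 4; ++i) flat[i] = 7;  // constant-valued source rows
  GaussCol_C(flat, flat, flat, flat, flat, col, kW + 4);  // each == 7 * 16
  GaussRow_C(col, out, kW);  // (7 * 16 * 16 + 128) >> 8 == 7 for every pixel
}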
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ dst_yuv24[3] = src_vu[0]; // V
+ dst_yuv24[4] = src_vu[1]; // U
+ dst_yuv24[5] = src_y[1]; // Y1
+ src_y += 2;
+ src_vu += 2;
+ dst_yuv24 += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ dst_yuv24[0] = src_vu[0]; // V
+ dst_yuv24[1] = src_vu[1]; // U
+ dst_yuv24[2] = src_y[0]; // Y0
+ }
+}
+
+// Filter 2 rows of AYUV UV's (444) into UV (420).
+void AYUVToUVRow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ // Output a row of UV values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_uv += 2;
+ }
+ if (width & 1) {
+ dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
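+
The + 2 >> 2 in these 2x2 averages rounds to nearest instead of truncating. Sketch of the difference (not part of the diff):

#include <stdint.h>
static uint8_t Avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);  // +2 rounds halves up
}
// Avg2x2(1, 1, 2, 2) == 2, whereas plain truncation (6 >> 2) gives 1.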
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ // Output a row of VU values, filtering 2x2 rows of AYUV.
+ int x;
+ for (x = 0; x < width; x += 2) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 4] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 5] + 2) >>
+ 2;
+ src_ayuv += 8;
+ dst_vu += 2;
+ }
+ if (width & 1) {
+ dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
+ src_ayuv[src_stride_ayuv + 0] + 2) >>
+ 2;
+ dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
+ src_ayuv[src_stride_ayuv + 1] + 2) >>
+ 2;
+ }
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_y[x] = src_ayuv[2]; // v,u,y,a
+ src_ayuv += 4;
+ }
+}
+
+void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ uint8_t u = src_uv[0];
+ uint8_t v = src_uv[1];
+ dst_vu[0] = v;
+ dst_vu[1] = u;
+ src_uv += 2;
+ dst_vu += 2;
+ }
+}
+
+// Divide values by weights and provide a mask to flag weights of 0.
+void FloatDivToByteRow_C(const float* src_weights,
+ const float* src_values,
+ uint8_t* dst_out,
+ uint8_t* dst_mask,
+ int width) {
+ int x;
+ for (x = 0; x < width; ++x) {
+ dst_out[x] = Clamp(src_values[x] / src_weights[x]);
+ dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+ }
+}
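+
A zero weight makes the quotient meaningless, so the second output plane flags those positions for the caller. Usage sketch with made-up data (not part of the diff):

#include <stdint.h>
static void FloatDivExample(void) {
  float weights[3] = {2.0f, 0.0f, 0.5f};
  float values[3] = {100.0f, 7.0f, 100.0f};
  uint8_t out[3], mask[3];
  FloatDivToByteRow_C(weights, values, out, mask, 3);
  // out == {50, ?, 200}; mask == {0, 0xff, 0}: index 1 is flagged invalid.
}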
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/row_gcc.cc b/files/source/row_gcc.cc
index 8735070..decd3d2 100644
--- a/files/source/row_gcc.cc
+++ b/files/source/row_gcc.cc
@@ -22,81 +22,80 @@
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0};
// JPEG full range.
-static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0};
-static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
- 127, -84, -43, 0, 127, -84, -43, 0};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+ 127, -84, -43, 0, 127, -84, -43, 0};
-static vec8 kARGBToV = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0};
-static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
- -20, -107, 127, 0, -20, -107, 127, 0};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+ -20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA
-static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
- 0, 33, 65, 13, 0, 33, 65, 13};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+ 0, 33, 65, 13, 0, 33, 65, 13};
-static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
- 0, -38, -74, 112, 0, -38, -74, 112};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+ 0, -38, -74, 112, 0, -38, -74, 112};
-static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
- 0, 112, -94, -18, 0, 112, -94, -18};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+ 0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR
-static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
- 33, 65, 13, 0, 33, 65, 13, 0};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+ 33, 65, 13, 0, 33, 65, 13, 0};
-static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
- -38, -74, 112, 0, -38, -74, 112, 0};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+ -38, -74, 112, 0, -38, -74, 112, 0};
-static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
- 112, -94, -18, 0, 112, -94, -18, 0};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+ 112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA.
-static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
- 0, 13, 65, 33, 0, 13, 65, 33};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+ 0, 13, 65, 33, 0, 13, 65, 33};
-static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
- 0, 112, -74, -38, 0, 112, -74, -38};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+ 0, 112, -74, -38, 0, 112, -74, -38};
-static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
- 0, -18, -94, 112, 0, -18, -94, 112};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+ 0, -18, -94, 112, 0, -18, -94, 112};
-static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
// 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
-static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
-static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
- 0x8080u, 0x8080u, 0x8080u, 0x8080u};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
- 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
// Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
- 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
+ 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
@@ -114,15 +113,15 @@
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
+static const uvec8 kShuffleMaskARGBToRGB24 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
+static const uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
// YUY2 shuf 16 Y to 32 Y.
@@ -153,392 +152,542 @@
#endif // HAS_RGB24TOARGBROW_SSSE3
#ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm5"
- );
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_J400TOARGBROW_SSE2
#ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x30,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
- "pslld $0x18,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x30,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "palignr $0x8,%%xmm1,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm2 \n"
- "palignr $0xc,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "por %%xmm5,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "palignr $0x4,%%xmm3,%%xmm3 \n"
- "pshufb %%xmm4,%%xmm3 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
- "movdqa %5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
- "lea " MEMLEA(0x18,0) ",%0 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
- "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskRAWToRGB24_0), // %3
- "m"(kShuffleMaskRAWToRGB24_1), // %4
- "m"(kShuffleMaskRAWToRGB24_2) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
+ int width) {
+ asm volatile(
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+ "movdqa %5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x4(%0),%%xmm1 \n"
+ "movdqu 0x8(%0),%%xmm2 \n"
+ "lea 0x18(%0),%0 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskRAWToRGB24_0), // %3
+ "m"(kShuffleMaskRAWToRGB24_1), // %4
+ "m"(kShuffleMaskRAWToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x20802080,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xa,%%xmm4 \n"
- "psrlw $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
}
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "mov $0x1080108,%%eax \n"
- "movd %%eax,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x42004200,%%eax \n"
- "movd %%eax,%%xmm6 \n"
- "pshufd $0x0,%%xmm6,%%xmm6 \n"
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psllw $0xb,%%xmm3 \n"
- "movdqa %%xmm3,%%xmm4 \n"
- "psrlw $0x6,%%xmm4 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psllw $0x8,%%xmm7 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psllw $0x1,%%xmm1 \n"
- "psllw $0xb,%%xmm2 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "psllw $0x8,%%xmm1 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "pmulhuw %%xmm6,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,0x00(%1,%0,2) \n"
+ "movdqu %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+ "xmm6", "xmm7");
}
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "mov $0xf0f0f0f,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x4,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pand %%xmm4,%%xmm0 \n"
- "pand %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "psllw $0x4,%%xmm1 \n"
- "psrlw $0x4,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
- MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,0x00(%1,%0,2) \n"
+ "movdqu %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x30,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "movdqa %3,%%xmm6 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "pshufb %%xmm6,%%xmm0 \n"
- "pshufb %%xmm6,%%xmm1 \n"
- "pshufb %%xmm6,%%xmm2 \n"
- "pshufb %%xmm6,%%xmm3 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "psrldq $0x4,%%xmm1 \n"
- "pslldq $0xc,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm5 \n"
- "por %%xmm4,%%xmm0 \n"
- "pslldq $0x8,%%xmm5 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "por %%xmm5,%%xmm1 \n"
- "psrldq $0x8,%%xmm2 \n"
- "pslldq $0x4,%%xmm3 \n"
- "por %%xmm3,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x30,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ "movdqa %3,%%xmm6 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "movdqu %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "psrld $0x1b,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1a,%%xmm4 \n"
- "pslld $0x5,%%xmm4 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0xb,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "pslld $0x8,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x5,%%xmm2 \n"
- "psrad $0x10,%%xmm0 \n"
- "pand %%xmm3,%%xmm1 \n"
- "pand %%xmm4,%%xmm2 \n"
- "pand %%xmm5,%%xmm0 \n"
- "por %%xmm2,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd for 12+12 to 24
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRGB24), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u,
+ 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+ 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+ 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+ 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+ 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+ 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+ 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+ 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
+
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm5 \n"
+ "vmovdqa %4,%%ymm6 \n"
+ "vmovdqa %5,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n"
+ "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n"
+ "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kPermARGBToRGB24_0), // %3
+ "m"(kPermARGBToRGB24_1), // %4
+ "m"(kPermARGBToRGB24_2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
+}
+#endif
+
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm6 \n"
+ "vmovdqa %4,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0
+ "vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
+ "vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes
+ "vpermd %%ymm1,%%ymm7,%%ymm1 \n"
+ "vpermd %%ymm2,%%ymm7,%%ymm2 \n"
+ "vpermd %%ymm3,%%ymm7,%%ymm3 \n"
+ "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16
+ "vpermq $0x4f,%%ymm2,%%ymm4 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24
+ "vpermq $0x93,%%ymm3,%%ymm3 \n"
+ "vpor %%ymm3,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm2,0x40(%1) \n"
+ "lea 0x60(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleMaskARGBToRAW), // %3
+ "m"(kPermdRGB24_AVX) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
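+ // xmm3/xmm4/xmm5 mask the blue (bits 0..4), green (bits 5..10) and red
+ // (bits 11 and up) fields; inside the loop each dword is packed as
+ // B>>3 | (G>>2)<<5 | (R>>3)<<11 and packssdw narrows it to 16-bit RGB565.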
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void ARGBToRGB565DitherRow_SSE2(const uint8* src,
- uint8* dst,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
int width) {
asm volatile(
"movd %3,%%xmm6 \n"
@@ -584,9 +733,9 @@
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src,
- uint8* dst,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ const uint32_t dither4,
int width) {
asm volatile(
"vbroadcastss %3,%%xmm6 \n"
@@ -629,153 +778,335 @@
}
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrld $0x1b,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "pslld $0x5,%%xmm5 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "pslld $0xa,%%xmm6 \n"
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "pslld $0xf,%%xmm7 \n"
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "psrad $0x10,%%xmm0 \n"
- "psrld $0x3,%%xmm1 \n"
- "psrld $0x6,%%xmm2 \n"
- "psrld $0x9,%%xmm3 \n"
- "pand %%xmm7,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm6,%%xmm3 \n"
- "por %%xmm1,%%xmm0 \n"
- "por %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :: "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_RGB24TOARGBROW_SSSE3
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8-bit value in the upper byte of a 16-bit lane, vpmulhuw by (1024+4)
+produces a 10-bit value in the low 10 bits of each 16-bit lane. This is what
+is wanted for the blue channel. The red needs to be shifted 4 left, so it is
+multiplied by (1024+4)*16 instead.
+
+Alpha Green
+Alpha and green are already in the high bits, so vpand can zero out the other
+bits, keeping just the 2 upper bits of alpha and the 8-bit green. The same
+(1024+4) multiplier puts the 10-bit green in the least significant bits.
+Alpha only needs a multiplier that shifts it into position, with a gap of 10
+bits above the green. Green is 10 bits, so 6 bits remain in the low short and
+4 more are needed: a multiplier of 4 moves the 2 alpha bits into the upper 16
+bits, and a shift of 4 is a multiply by 16, so the multiplier is (4*16) = 64.
+The result is then shifted left 10 to position the A and G channels.
+*/
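+
+// A minimal scalar sketch of the packing described above (a hypothetical
+// helper for illustration, not the library's C fallback): the (1024+4)
+// multiplier replicates an 8-bit channel to 10 bits, since
+// (v * 1028) >> 8 == (v << 2) | (v >> 6).
+static inline uint32_t ARGBPixelToAR30_Sketch(uint32_t argb) {
+  uint32_t b10 = (((argb >> 0) & 0xff) * 1028) >> 8;  // 10-bit blue
+  uint32_t g10 = (((argb >> 8) & 0xff) * 1028) >> 8;  // 10-bit green
+  uint32_t r10 = (((argb >> 16) & 0xff) * 1028) >> 8;  // 10-bit red
+  uint32_t a2 = argb >> 30;  // keep the top 2 bits of alpha
+  return (a2 << 30) | (r10 << 20) | (g10 << 10) | b10;
+}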
+
+// Shuffle tables that place the B and R bytes of each pixel in the upper
+// byte of a 16-bit lane (kShuffleBR30 swaps the fetch order for ABGR input),
+// ready for the pmulhuw multipliers below.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
+ 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
+ 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different (full-range) coefficients, no +16
// bias, and rounding added before the shift.
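// As a rough scalar model (kB/kG/kR are placeholder names; the real
// coefficients live in kARGBToY and kARGBToYJ):
//   Y  = ((kB*B + kG*G + kR*R) >> 7) + 16;   // ARGBToYRow, limited range
//   YJ = (kBj*B + kGj*G + kRj*R + 64) >> 7;  // ARGBToYJRow, rounded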
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBTOYJROW_SSSE3
@@ -784,153 +1115,149 @@
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea " MEMLEA(0x80,0) ",%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "vbroadcastf128 %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
- "vmovdqu %5,%%ymm6 \n"
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "lea " MEMLEA(0x80,0) ",%0 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
- "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
- "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
- "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64), // %4
- "m"(kPermdARGBToY_AVX) // %5
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBTOYJROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToV), // %5
- "m"(kARGBToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
- );
+ "lea 0x40(%0),%0 \n"
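+ // Horizontally average adjacent pixel pairs: shufps $0x88/$0xdd gather the
+ // even/odd pixels of each group and pavgb averages them (the two rows were
+ // already averaged by the pavgb loads above).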
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -939,643 +1266,644 @@
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
-void ARGBToUVRow_AVX2(const uint8* src_argb0,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
- VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
- VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
- VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
- VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
- "lea " MEMLEA(0x80,0) ",%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUV128), // %5
- "m"(kARGBToV), // %6
- "m"(kARGBToU), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vbroadcastf128 %5,%%ymm5 \n"
- "vbroadcastf128 %6,%%ymm6 \n"
- "vbroadcastf128 %7,%%ymm7 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
- VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
- VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
- VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
- VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
- "lea " MEMLEA(0x80,0) ",%0 \n"
- "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
- "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
- "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
- "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
- "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
- "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x40(%0),%%ymm2 \n"
+ "vmovdqu 0x60(%0),%%ymm3 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
+ "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
+ "lea 0x80(%0),%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
- "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
- "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
- "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm1,%%ymm1 \n"
- "vpsraw $0x8,%%ymm0,%%ymm0 \n"
- "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kAddUVJ128), // %5
- "m"(kARGBToVJ), // %6
- "m"(kARGBToUJ), // %7
- "m"(kShufARGBToUV_AVX) // %8
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ "vextractf128 $0x0,%%ymm0,(%1) \n"
+ "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUVJ128), // %5
+ "m"(kARGBToVJ), // %6
+ "m"(kARGBToUJ), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)), // %4
- "m"(kARGBToVJ), // %5
- "m"(kARGBToUJ), // %6
- "m"(kAddUVJ128) // %7
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
- );
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
#ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %4,%%xmm3 \n"
- "movdqa %5,%%xmm4 \n"
- "movdqa %6,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "m"(kARGBToV), // %4
- "m"(kARGBToU), // %5
- "m"(kAddUV128) // %6
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
}
#endif // HAS_ARGBTOUV444ROW_SSSE3
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)), // %4
- "m"(kBGRAToV), // %5
- "m"(kBGRAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
- );
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)), // %4
- "m"(kABGRToV), // %5
- "m"(kABGRToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
- );
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movdqa %5,%%xmm3 \n"
- "movdqa %6,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)), // %4
- "m"(kRGBAToV), // %5
- "m"(kRGBAToU), // %6
- "m"(kAddUV128) // %7
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
- );
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
// Read 8 UV from 444
-#define READYUV444 \
- "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV
-#define READYUV422 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
+
+// Read 4 UV from 422 10 bit, upsample to 8 UV
+// TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
+#define READYUV210 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklwd %%xmm1,%%xmm0 \n" \
+ "psraw $0x2,%%xmm0 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movdqu (%[y_buf]),%%xmm4 \n" \
+ "psllw $0x6,%%xmm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
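A scalar sketch of what one READYUV210 step computes, assuming 10-bit samples stored in the low bits of little-endian uint16_t (as the psraw/psllw shifts imply); the helper name is illustrative, not part of libyuv:

    /* Sketch: one READYUV210 step for a pair of pixels. */
    static inline void ReadYUV210_C(const uint16_t* u, const uint16_t* v,
                                    const uint16_t* y,
                                    uint8_t uv8[2], uint16_t y16[2]) {
      uv8[0] = (uint8_t)(u[0] >> 2);   /* psraw $0x2 + packuswb: 10 -> 8 bits */
      uv8[1] = (uint8_t)(v[0] >> 2);
      y16[0] = (uint16_t)(y[0] << 6);  /* psllw $0x6: left-justify Y for pmulhuw */
      y16[1] = (uint16_t)(y[1] << 6);
    }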
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
- "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"
+#define READYUVA422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "movq (%[a_buf]),%%xmm5 \n" \
+ "lea 0x8(%[a_buf]),%[a_buf] \n"
// Read 4 UV from NV12, upsample to 8 UV
-#define READNV12 \
- "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 VU from NV21, upsample to 8 UV
-#define READNV21 \
- "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
- "pshufb %[kShuffleNV21], %%xmm0 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "punpcklbw %%xmm4,%%xmm4 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
+#define READNV21 \
+ "movq (%[vu_buf]),%%xmm0 \n" \
+ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \
+ "pshufb %[kShuffleNV21], %%xmm0 \n" \
+ "movq (%[y_buf]),%%xmm4 \n" \
+ "punpcklbw %%xmm4,%%xmm4 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n"
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2 \
- "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
- "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
- "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
- "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
- "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"
+#define READYUY2 \
+ "movdqu (%[yuy2_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
+ "movdqu (%[yuy2_buf]),%%xmm0 \n" \
+ "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
+ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY \
- "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
- "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
- "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
- "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
- "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"
+#define READUYVY \
+ "movdqu (%[uyvy_buf]),%%xmm4 \n" \
+ "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
+ "movdqu (%[uyvy_buf]),%%xmm0 \n" \
+ "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
+ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
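For tracing these shuffles: YUY2 stores a macropixel as Y0 U0 Y1 V0 and UYVY as U0 Y0 V0 Y1, so each mask pair splits the two Y bytes from the UV bytes both pixels share. A minimal scalar equivalent for YUY2 (helper name illustrative):

    static inline void ReadYUY2_C(const uint8_t* yuy2,
                                  uint8_t y[2], uint8_t uv[2]) {
      y[0] = yuy2[0];   /* Y0 */
      y[1] = yuy2[2];   /* Y1 */
      uv[0] = yuy2[1];  /* U, shared by both pixels */
      uv[1] = yuy2[3];  /* V, shared by both pixels */
    }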
#if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants) \
- "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
- "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
- "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
- "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
- "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
- "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
- "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
+#define YUVTORGB_SETUP(yuvconstants) \
+ "movdqa (%[yuvconstants]),%%xmm8 \n" \
+ "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
+ "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm12 \n" \
+ "movdqa 160(%[yuvconstants]),%%xmm13 \n" \
+ "movdqa 192(%[yuvconstants]),%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
+#define YUVTORGB16(yuvconstants) \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
@@ -1591,72 +1919,95 @@
"pmulhuw %%xmm14,%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+ "paddsw %%xmm4,%%xmm2 \n"
#define YUVTORGB_REGS \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm3 \n" \
- "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
- "psubw %%xmm1,%%xmm0 \n" \
- "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
- "psubw %%xmm2,%%xmm1 \n" \
- "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
- "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
- "psubw %%xmm3,%%xmm2 \n" \
- "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
- "paddsw %%xmm4,%%xmm0 \n" \
- "paddsw %%xmm4,%%xmm1 \n" \
- "paddsw %%xmm4,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+#define YUVTORGB16(yuvconstants) \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa 96(%[yuvconstants]),%%xmm0 \n" \
+ "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa 128(%[yuvconstants]),%%xmm1 \n" \
+ "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa 160(%[yuvconstants]),%%xmm2 \n" \
+ "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
+ "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
+ "paddsw %%xmm4,%%xmm0 \n" \
+ "paddsw %%xmm4,%%xmm1 \n" \
+ "paddsw %%xmm4,%%xmm2 \n"
#define YUVTORGB_REGS
#endif
+#define YUVTORGB(yuvconstants) \
+ YUVTORGB16(yuvconstants) \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
+
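The point of splitting YUVTORGB16 out of YUVTORGB: the 16-bit stage stops before narrowing, so the new AR30 paths can round to 10 bits (psraw $4 in STOREAR30 below) while the 8-bit paths finish with psraw $6 + packuswb. Reading the offsets as the usual YuvConstants layout (UV-to-B/G/R coefficients at 0/32/64, biases at 96/128/160, Y gain at 192 -- an assumption from the offsets, not stated in this hunk), each channel per pixel is roughly:

    chan16 = bias_chan - (U*coeff_u + V*coeff_v) + ((Y_in * y_gain) >> 16)
    chan8  = clamp8(chan16 >> 6)

where Y_in is the Y byte duplicated into both halves of a word (punpcklbw, i.e. y*257) for the 8-bit reads, or the 10-bit value shifted left 6 for READYUV210.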
// Store 8 ARGB values.
-#define STOREARGB \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklbw %%xmm5,%%xmm2 \n" \
- "movdqa %%xmm0,%%xmm1 \n" \
- "punpcklwd %%xmm2,%%xmm0 \n" \
- "punpckhwd %%xmm2,%%xmm1 \n" \
- "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
- "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
- "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0,(%[dst_argb]) \n" \
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
+ "lea 0x20(%[dst_argb]), %[dst_argb] \n"
// Store 8 RGBA values.
-#define STORERGBA \
- "pcmpeqb %%xmm5,%%xmm5 \n" \
- "punpcklbw %%xmm2,%%xmm1 \n" \
- "punpcklbw %%xmm0,%%xmm5 \n" \
- "movdqa %%xmm5,%%xmm0 \n" \
- "punpcklwd %%xmm1,%%xmm5 \n" \
- "punpckhwd %%xmm1,%%xmm0 \n" \
- "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
- "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
- "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5,(%[dst_rgba]) \n" \
+ "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+// Store 8 AR30 values.
+#define STOREAR30 \
+ "psraw $0x4,%%xmm0 \n" \
+ "psraw $0x4,%%xmm1 \n" \
+ "psraw $0x4,%%xmm2 \n" \
+ "pminsw %%xmm7,%%xmm0 \n" \
+ "pminsw %%xmm7,%%xmm1 \n" \
+ "pminsw %%xmm7,%%xmm2 \n" \
+ "pmaxsw %%xmm6,%%xmm0 \n" \
+ "pmaxsw %%xmm6,%%xmm1 \n" \
+ "pmaxsw %%xmm6,%%xmm2 \n" \
+ "psllw $0x4,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm3 \n" \
+ "movdqa %%xmm1,%%xmm2 \n" \
+ "punpcklwd %%xmm5,%%xmm1 \n" \
+ "punpckhwd %%xmm5,%%xmm2 \n" \
+ "pslld $0xa,%%xmm1 \n" \
+ "pslld $0xa,%%xmm2 \n" \
+ "por %%xmm1,%%xmm0 \n" \
+ "por %%xmm2,%%xmm3 \n" \
+ "movdqu %%xmm0,(%[dst_ar30]) \n" \
+ "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
+ "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"
+
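STOREAR30 amounts to ARGB2101010 packing: two opaque alpha bits on top, then 10-bit R, G, B fields. A hedged scalar model, assuming xmm0/xmm1/xmm2 hold B/G/R as 16-bit values with 6 fractional bits (as YUVTORGB16 leaves them); helper names are illustrative:

    static inline uint32_t Clamp10(int v) {  /* the pminsw/pmaxsw pair */
      return (uint32_t)(v < 0 ? 0 : v > 1023 ? 1023 : v);
    }
    static inline uint32_t PackAR30_C(int16_t b16, int16_t g16, int16_t r16) {
      return 0xC0000000u |                /* A = 3, the 2 alpha bits          */
             (Clamp10(r16 >> 4) << 20) |  /* psraw $4: 6 frac bits -> 10 bits */
             (Clamp10(g16 >> 4) << 10) |
             Clamp10(b16 >> 4);
    }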
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -1677,15 +2028,15 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -1706,9 +2057,9 @@
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
- "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
"subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -1723,15 +2074,15 @@
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
[kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -1752,17 +2103,116 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+      "pxor       %%xmm6,%%xmm6                \n"  // 0 for min
+      "pcmpeqb    %%xmm7,%%xmm7                \n"  // 1023 for max
+      "psrlw      $6,%%xmm7                    \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
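The constant setup above builds its immediates from an all-ones register instead of memory loads; per 16-bit lane:

    pcmpeqb x,x           -> 0xFFFF
    0xFFFF >> 6  (psrlw)  -> 0x03FF = 1023  (10-bit ceiling for pminsw)
    0xFFFF >> 14 (psrlw)  -> 0x0003
    0x0003 << 4  (psllw)  -> 0x0030         (ends up as A = 3 after the store shuffle)
    pxor x,x              -> 0x0000         (floor for pmaxsw)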
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB(yuvconstants)
+ STOREARGB
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $14,%%xmm5 \n"
+ "psllw $4,%%xmm5 \n" // 2 alpha bits
+      "pxor       %%xmm6,%%xmm6                \n"  // 0 for min
+      "pcmpeqb    %%xmm7,%%xmm7                \n"  // 1023 for max
+      "psrlw      $6,%%xmm7                    \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210
+ YUVTORGB16(yuvconstants)
+ STOREAR30
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -1788,16 +2238,16 @@
[width]"+rm"(width) // %[width]
#endif
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
#endif // HAS_I422ALPHATOARGBROW_SSSE3
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -1817,15 +2267,15 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -1846,14 +2296,14 @@
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
- uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -1874,14 +2324,14 @@
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleYUY2Y]"m"(kShuffleYUY2Y),
[kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
-void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
- uint8* dst_argb,
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -1902,16 +2352,16 @@
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleUYVYY]"m"(kShuffleUYVYY),
[kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
}
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -1932,7 +2382,7 @@
[dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS
+ : "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -1940,96 +2390,113 @@
#endif // HAS_I422TOARGBROW_SSSE3
// Read 16 UV from 444
-#define READYUV444_AVX2 \
- "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+#define READYUV444_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+#define READYUV422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
+
+// Read 8 UV from 210 10 bit, upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
+#define READYUV210_AVX2 \
+ "vmovdqu (%[u_buf]),%%xmm0 \n" \
+ "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x10(%[u_buf]),%[u_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x2,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%ymm4 \n" \
+ "vpsllw $0x6,%%ymm4,%%ymm4 \n" \
+ "lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
-#define READYUVA422_AVX2 \
- "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
- "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
- "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
- "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
- "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
+#define READYUVA422_AVX2 \
+ "vmovq (%[u_buf]),%%xmm0 \n" \
+ "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n" \
+ "vmovdqu (%[a_buf]),%%xmm5 \n" \
+ "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
+ "lea 0x10(%[a_buf]),%[a_buf] \n"
// Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 \
- "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+#define READNV12_AVX2 \
+ "vmovdqu (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2 \
- "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
- "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
- "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
- "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
- "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
- "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
+#define READNV21_AVX2 \
+ "vmovdqu (%[vu_buf]),%%xmm0 \n" \
+ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
+ "vmovdqu (%[y_buf]),%%xmm4 \n" \
+ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
+ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
+ "lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 \
- "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
- "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
- "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
- "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
- "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
+#define READYUY2_AVX2 \
+ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \
+ "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
+ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 \
- "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
- "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
- "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
- "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
- "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
+#define READUYVY_AVX2 \
+ "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
+ "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
+ "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \
+ "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
+ "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
#if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants) \
- "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
- "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
- "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
- "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
- "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
- "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
- "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
+#define YUVTORGB_SETUP_AVX2(yuvconstants) \
+ "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
+ "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
+ "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
+ "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
+ "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
+ "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
+ "vmovdqa 192(%[yuvconstants]),%%ymm14 \n"
-#define YUVTORGB_AVX2(yuvconstants) \
+#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
"vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
"vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
@@ -2039,13 +2506,7 @@
"vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
@@ -2053,48 +2514,78 @@
#else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants) \
- "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
- "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
- "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
- "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
- "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
- "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
- "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
- "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
- "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
- "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
- "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
- "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
- "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
- "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
- "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
- "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
- "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+#define YUVTORGB16_AVX2(yuvconstants) \
+ "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \
+ "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
+ "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \
+ "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
+ "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
+ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2
#endif
+#define YUVTORGB_AVX2(yuvconstants) \
+ YUVTORGB16_AVX2(yuvconstants) \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
// Store 16 ARGB values.
-#define STOREARGB_AVX2 \
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
- "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
- "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
- "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
- "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
- "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
- "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
- "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
- "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
+#define STOREARGB_AVX2 \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vmovdqu %%ymm1,(%[dst_argb]) \n" \
+ "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
+ "lea 0x40(%[dst_argb]), %[dst_argb] \n"
+
+// Store 16 AR30 values.
+#define STOREAR30_AVX2 \
+ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x4,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
+ "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
+ "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
+ "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
+ "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
+ "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
+ "vpsllw $0x4,%%ymm2,%%ymm2 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \
+ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \
+ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm1,%%ymm1 \n" \
+ "vpslld $0xa,%%ymm2,%%ymm2 \n" \
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpor %%ymm2,%%ymm3,%%ymm3 \n" \
+ "vmovdqu %%ymm0,(%[dst_ar30]) \n" \
+ "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \
+ "lea 0x40(%[dst_ar30]), %[dst_ar30] \n"
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -2116,7 +2607,7 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -2125,10 +2616,10 @@
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -2151,20 +2642,135 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I422TOARGBROW_AVX2
+#if defined(HAS_I422TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_argb,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB_AVX2(yuvconstants)
+ STOREARGB_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I210TOARGBROW_AVX2
+
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+ const uint16_t* u_buf,
+ const uint16_t* v_buf,
+ uint8_t* dst_ar30,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP_AVX2(yuvconstants)
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
+ "vpsrlw $14,%%ymm5,%%ymm5 \n"
+ "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
+ "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
+ "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
+ "vpsrlw $6,%%ymm7,%%ymm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ READYUV210_AVX2
+ YUVTORGB16_AVX2(yuvconstants)
+ STOREAR30_AVX2
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+ [width]"+rm"(width) // %[width]
+ : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+ : "memory", "cc", YUVTORGB_REGS_AVX2
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+ );
+}
+#endif // HAS_I210TOAR30ROW_AVX2
+
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -2191,7 +2797,7 @@
[width]"+rm"(width) // %[width]
#endif
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
@@ -2201,10 +2807,10 @@
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -2224,11 +2830,11 @@
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
- "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
- "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x10,%[width] \n"
- "jg 1b \n"
+ "vmovdqu %%ymm0,(%[dst_argb]) \n"
+ "vmovdqu %%ymm1,0x20(%[dst_argb]) \n"
+ "lea 0x40(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
@@ -2236,7 +2842,7 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -2245,9 +2851,9 @@
#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -2268,7 +2874,7 @@
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
@@ -2278,9 +2884,9 @@
#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -2302,7 +2908,7 @@
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleNV21]"m"(kShuffleNV21)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
@@ -2312,8 +2918,8 @@
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
- uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -2335,7 +2941,7 @@
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleYUY2Y]"m"(kShuffleYUY2Y),
[kShuffleYUY2UV]"m"(kShuffleYUY2UV)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
@@ -2345,8 +2951,8 @@
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
- uint8* dst_argb,
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// clang-format off
@@ -2368,7 +2974,7 @@
: [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
[kShuffleUYVYY]"m"(kShuffleUYVYY),
[kShuffleUYVYUV]"m"(kShuffleUYVYUV)
- : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
+ : "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
// clang-format on
@@ -2376,552 +2982,957 @@
#endif // HAS_UYVYTOARGBROW_AVX2
#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
- asm volatile (
- "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
- "movd %%eax,%%xmm2 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+      "mov        $0x04880488,%%eax            \n"  // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "psubusw %%xmm3,%%xmm0 \n"
- "psrlw $6, %%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
- // Step 2: Weave into ARGB
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "por %%xmm4,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
- );
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_SSE2
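Checking the endpoints of that fixed-point path (movq -> punpcklbw gives y*257, then pmulhuw 18997, psubusw 1160, psrlw 6):

    y = 16:  (16 * 257 * 18997) >> 16 = 1191;   sat-sub 1160 -> 31;    31 >> 6 = 0
    y = 235: (235 * 257 * 18997) >> 16 = 17506; 17506 - 1160 = 16346;  16346 >> 6 = 255

so video-range Y [16..235] lands on full-range gray [0..255], i.e. G = (y - 16) * 1.164 with saturating underflow at black.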
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
- asm volatile (
- "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
- "vmovd %%eax,%%xmm2 \n"
- "vbroadcastss %%xmm2,%%ymm2 \n"
- "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
- "vmovd %%eax,%%xmm3 \n"
- "vbroadcastss %%xmm3,%%ymm3 \n"
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpslld $0x18,%%ymm4,%%ymm4 \n"
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+ asm volatile(
+      "mov        $0x4a354a35,%%eax            \n"  // 4a35 = 18997 = 1.164
+      "vmovd      %%eax,%%xmm2                 \n"
+      "vbroadcastss %%xmm2,%%ymm2              \n"
+      "mov        $0x4880488,%%eax             \n"  // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
- LABELALIGN
- "1: \n"
- // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
- "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
- "vpor %%ymm4,%%ymm0,%%ymm0 \n"
- "vpor %%ymm4,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(y_buf), // %0
- "+r"(dst_argb), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc", "eax"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
- );
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_I400TOARGBROW_AVX2
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
- 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "movdqa %3,%%xmm5 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
- "pshufb %%xmm5,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm5"
- );
+ "movdqa %3,%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu -0x10(%0,%2,1),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "vbroadcastf128 %3,%%ymm5 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpermq $0x4e,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kShuffleMirror) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm5"
- );
+ "vbroadcastf128 %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
- 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-void MirrorUVRow_SSSE3(const uint8* src,
- uint8* dst_u,
- uint8* dst_v,
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+ 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "movdqa %4,%%xmm1 \n"
- "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "movdqa %4,%%xmm1 \n"
+ "lea -0x10(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- "pshufb %%xmm1,%%xmm0 \n"
- "movlpd %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $8,%3 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(temp_width) // %3
- : "m"(kShuffleMirrorUV) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_MIRRORUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "pshufd $0x1b,%%xmm0,%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- :
- : "memory", "cc"
- , "xmm0"
- );
+ "lea -0x10(%0,%2,4),%0 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
+ "lea -0x10(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc", "xmm0");
}
#endif // HAS_ARGBMIRRORROW_SSE2
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
- asm volatile (
- "vmovdqu %3,%%ymm5 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror_AVX2) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm5"
- );
+ "vmovdqu %3,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm2,0x00(%1,%2,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
- asm volatile (
- "sub %0,%1 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
- "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
- "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
- "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
- "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2"
- );
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
- asm volatile (
- "sub %0,%1 \n"
+ asm volatile(
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2"
- );
+ "sub %0,%1 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEUVROW_SSE2
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+ const uint16_t* src_v,
+ uint16_t* dst_uv,
+ int scale,
+ int width) {
+ // clang-format off
asm volatile (
- "test $0xf,%0 \n"
- "jne 2f \n"
- "test $0xf,%1 \n"
- "jne 2f \n"
+ "vmovd %4,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+ // 16 pixels per loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu (%0,%1,1),%%ymm1 \n"
+ "add $0x20,%0 \n"
+
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates
+ "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2,(%2) \n"
+ "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
+ "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
+ "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
+ "add $0x40,%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ : "r"(scale) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MERGEUVROW_16_AVX2
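
The scale factors above are easiest to read in scalar form (the same
factors drive MultiplyRow_16_AVX2 below): vpmullw keeps only the low 16
bits of each product, so multiplying n-bit lsb-aligned data by 2^(16-n)
shifts it into the msb. A minimal C sketch, with a hypothetical helper
name:

  #include <stdint.h>

  // Scalar model of the vpmullw-based lsb-to-msb widening: for n-bit
  // data in the low bits of a 16-bit lane, scale = 1 << (16 - n), so
  // 128 = 9 bits, 64 = 10 bits, 16 = 12 bits, 1 = 16 bits.
  static inline uint16_t ScaleToMsb16(uint16_t v, uint16_t scale) {
    return (uint16_t)(v * scale);  // wraps mod 2^16, like vpmullw
  }
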
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm3 \n"
+ "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "sub %0,%1 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%0,%1) \n"
+ "vmovdqu %%ymm1,0x20(%0,%1) \n"
+ "add $0x40,%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
- "jmp 9f \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm3");
+ // clang-format on
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
LABELALIGN
- "2: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "add $0x20,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "add $0x10,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
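
pmulhuw keeps the high 16 bits of the unsigned product, so the scale
values above behave as a right shift by (n - 8) for n-bit input before
packuswb saturates to bytes. A scalar sketch of one lane, with a
hypothetical helper name:

  #include <stdint.h>

  // Scalar model of Convert16To8Row: take the high 16 bits of
  // v * scale, then saturate to 8 bits as packuswb does.
  // 32768 = 9 bits, 16384 = 10 bits, 4096 = 12 bits, 256 = 16 bits.
  static inline uint8_t Convert16To8(uint16_t v, uint32_t scale) {
    uint32_t r = ((uint32_t)v * scale) >> 16;
    return (uint8_t)(r > 255 ? 255 : r);
  }
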
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+ uint8_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "add $0x40,%0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "add $0x20,%1 \n"
"sub $0x20,%2 \n"
- "jg 2b \n"
- "9: \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT16TO8ROW_AVX2
+
+// Use scale to convert to lsb formats, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+
+ // 16 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "add $0x20,%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
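
Here punpcklbw pairs each byte with itself, turning v into v * 0x0101
per 16-bit lane; pmulhuw then keeps the high 16 bits of the product
with scale, so 0..255 maps onto the full n-bit output range. A scalar
sketch, with a hypothetical helper name:

  #include <stdint.h>

  // Scalar model of Convert8To16Row: replicate the byte into both
  // halves of a 16-bit lane, then keep the high 16 bits of the
  // product. 512 = 9 bits, 1024 = 10 bits, 4096 = 12 bits.
  static inline uint16_t Convert8To16(uint8_t v, uint32_t scale) {
    uint32_t replicated = (uint32_t)v * 0x0101;  // 0xAB -> 0xABAB
    return (uint16_t)((replicated * scale) >> 16);
  }
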
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+ uint16_t* dst_y,
+ int scale,
+ int width) {
+ // clang-format off
+ asm volatile (
+ "vmovd %3,%%xmm2 \n"
+ "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+
+ // 32 pixels per loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "add $0x40,%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ : "r"(scale) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+ // clang-format on
+}
+#endif // HAS_CONVERT8TO16ROW_AVX2
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 2u, 5u, 8u, 11u, 14u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 1u,
+ 4u, 7u, 10u, 13u};
+
+static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
+ 3u, 6u, 9u, 12u, 15u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 2u,
+ 5u, 8u, 11u, 14u};
+
+static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
+ 4u, 7u, 10u, 13u, 128u, 128u,
+ 128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 0u, 3u,
+ 6u, 9u, 12u, 15u};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "lea 0x10(%3),%3 \n"
+ "lea 0x30(%0),%0 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskRGBToR0), // %5
+ "m"(kShuffleMaskRGBToR1), // %6
+ "m"(kShuffleMaskRGBToR2), // %7
+ "m"(kShuffleMaskRGBToG0), // %8
+ "m"(kShuffleMaskRGBToG1), // %9
+ "m"(kShuffleMaskRGBToG2), // %10
+ "m"(kShuffleMaskRGBToB0), // %11
+ "m"(kShuffleMaskRGBToB1), // %12
+ "m"(kShuffleMaskRGBToB2) // %13
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_SPLITRGBROW_SSSE3
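
The trick in SplitRGBRow_SSSE3 is that a pshufb index with the high bit
set (128) writes zero, so the three shuffles of the three 16-byte RGB
chunks each fill a disjoint slice of one plane and por combines them. A
scalar model of that shuffle step (helper name hypothetical):

  #include <stdint.h>

  // Scalar model of pshufb as used above: an index >= 128 produces
  // zero, otherwise the low 4 bits select a source byte. One output
  // plane is then Pshufb(c0, m0) | Pshufb(c1, m1) | Pshufb(c2, m2).
  static void Pshufb(const uint8_t src[16], const uint8_t mask[16],
                     uint8_t out[16]) {
    for (int i = 0; i < 16; ++i) {
      out[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
    }
  }
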
+
+#ifdef HAS_MERGERGBROW_SSSE3
+
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
+ 2u, 128u, 128u, 3u, 128u, 128u,
+ 4u, 128u, 128u, 5u};
+static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
+ 128u, 2u, 128u, 128u, 3u, 128u,
+ 128u, 4u, 128u, 128u};
+static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
+ 128u, 128u, 2u, 128u, 128u, 3u,
+ 128u, 128u, 4u, 128u};
+
+static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
+ 7u, 128u, 128u, 8u, 128u, 128u,
+ 9u, 128u, 128u, 10u};
+static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
+ 128u, 7u, 128u, 128u, 8u, 128u,
+ 128u, 9u, 128u, 128u};
+static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
+ 128u, 128u, 8u, 128u, 128u, 9u,
+ 128u, 128u, 10u, 128u};
+
+static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
+ 12u, 128u, 128u, 13u, 128u, 128u,
+ 14u, 128u, 128u, 15u};
+static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
+ 128u, 13u, 128u, 128u, 14u, 128u,
+ 128u, 15u, 128u, 128u};
+static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
+ 128u, 128u, 13u, 128u, 128u, 14u,
+ 128u, 128u, 15u, 128u};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %5, %%xmm0 \n"
+ "pshufb %6, %%xmm1 \n"
+ "pshufb %7, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %8, %%xmm0 \n"
+ "pshufb %9, %%xmm1 \n"
+ "pshufb %10, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,16(%3) \n"
+
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "pshufb %11, %%xmm0 \n"
+ "pshufb %12, %%xmm1 \n"
+ "pshufb %13, %%xmm2 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,32(%3) \n"
+
+ "lea 0x10(%0),%0 \n"
+ "lea 0x10(%1),%1 \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x30(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : "m"(kShuffleMaskRToRGB0), // %5
+ "m"(kShuffleMaskGToRGB0), // %6
+ "m"(kShuffleMaskBToRGB0), // %7
+ "m"(kShuffleMaskRToRGB1), // %8
+ "m"(kShuffleMaskGToRGB1), // %9
+ "m"(kShuffleMaskBToRGB1), // %10
+ "m"(kShuffleMaskRToRGB2), // %11
+ "m"(kShuffleMaskGToRGB2), // %12
+ "m"(kShuffleMaskBToRGB2) // %13
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_MERGERGBROW_SSSE3
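
MergeRGBRow_SSSE3 is the inverse operation: the nine masks scatter each
plane's bytes into their interleaved positions across three 16-byte
stores. Its effect is equivalent to this scalar sketch (not the
library's own C fallback):

  #include <stdint.h>

  // Scalar equivalent of MergeRGBRow: interleave three planes into
  // packed 24-bit RGB.
  static void MergeRGB(const uint8_t* r, const uint8_t* g,
                       const uint8_t* b, uint8_t* dst_rgb, int width) {
    for (int i = 0; i < width; ++i) {
      dst_rgb[3 * i + 0] = r[i];
      dst_rgb[3 * i + 1] = g[i];
      dst_rgb[3 * i + 2] = b[i];
    }
  }
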
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "test $0xf,%0 \n"
+ "jne 2f \n"
+ "test $0xf,%1 \n"
+ "jne 2f \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 9f \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 2b \n"
+
+ LABELALIGN "9: \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_COPYROW_SSE2
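
CopyRow_SSE2 tests the low four bits of both pointers once up front and
takes the movdqa loop only when source and destination are both 16-byte
aligned, falling back to the movdqu loop otherwise. The check is
equivalent to this sketch (helper name hypothetical):

  #include <stdint.h>

  // Both pointers must be 16-byte aligned for the movdqa path.
  static int BothAligned16(const uint8_t* src, const uint8_t* dst) {
    return (((uintptr_t)src | (uintptr_t)dst) & 15) == 0;
  }
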
#ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x40,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_COPYROW_AVX
#ifdef HAS_COPYROW_ERMS
// Width is a multiple of 1 (no width restriction).
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
- :
- : "memory", "cc");
+ asm volatile(
+
+ "rep movsb \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc");
}
#endif // HAS_COPYROW_ERMS
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm4 \n"
- "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2," MEMACCESS(1) " \n"
- "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1," MEMACCESS(1) " \n"
- "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "vmovdqu 0x20(%0),%%ymm2 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ", %%xmm0 \n"
- "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
- "lea " MEMLEA(0x20, 0) ", %0 \n"
- "psrld $0x18, %%xmm0 \n"
- "psrld $0x18, %%xmm1 \n"
- "packssdw %%xmm1, %%xmm0 \n"
- "packuswb %%xmm0, %%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8, 1) ", %1 \n"
- "sub $0x8, %2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0), %%xmm0 \n"
+ "movdqu 0x10(%0), %%xmm1 \n"
+ "lea 0x20(%0), %0 \n"
+ "psrld $0x18, %%xmm0 \n"
+ "psrld $0x18, %%xmm1 \n"
+ "packssdw %%xmm1, %%xmm0 \n"
+ "packuswb %%xmm0, %%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1), %1 \n"
+ "sub $0x8, %2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_ARGBEXTRACTALPHAROW_SSE2
@@ -2930,657 +3941,636 @@
3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
-void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- "vmovdqa %3,%%ymm4 \n"
- "vbroadcastf128 %4,%%ymm5 \n"
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "vmovdqa %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
- "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
- "lea " MEMLEA(0x80, 0) ", %0 \n"
- "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
- "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
- "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20, %2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+rm"(width) // %2
- : "m"(kPermdARGBToY_AVX), // %3
- "m"(kShuffleAlphaShort_AVX2) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0), %%ymm0 \n"
+ "vmovdqu 0x20(%0), %%ymm1 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x40(%0), %%ymm2 \n"
+ "vmovdqu 0x60(%0), %%ymm3 \n"
+ "lea 0x80(%0), %0 \n"
+ "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20, %2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+rm"(width) // %2
+ : "m"(kPermdARGBToY_AVX), // %3
+ "m"(kShuffleAlphaShort_AVX2) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBEXTRACTALPHAROW_AVX2
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm0,%%xmm0 \n"
- "pslld $0x18,%%xmm0 \n"
- "pcmpeqb %%xmm1,%%xmm1 \n"
- "psrld $0x8,%%xmm1 \n"
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpckhwd %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm2,%%xmm2 \n"
- "movdqu " MEMACCESS(1) ",%%xmm4 \n"
- "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
- "pand %%xmm0,%%xmm2 \n"
- "pand %%xmm0,%%xmm3 \n"
- "pand %%xmm1,%%xmm4 \n"
- "pand %%xmm1,%%xmm5 \n"
- "por %%xmm4,%%xmm2 \n"
- "por %%xmm5,%%xmm3 \n"
- "movdqu %%xmm2," MEMACCESS(1) " \n"
- "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm2 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqu (%1),%%xmm4 \n"
+ "movdqu 0x10(%1),%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
- LABELALIGN
- "1: \n"
- "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
- "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "vpslld $0x18,%%ymm1,%%ymm1 \n"
- "vpslld $0x18,%%ymm2,%%ymm2 \n"
- "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
- "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
- "vmovdqu %%ymm1," MEMACCESS(1) " \n"
- "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2"
- );
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm1 \n"
+ "vpmovzxbd 0x8(%0),%%ymm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1,(%1) \n"
+ "vmovdqu %%ymm2,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint8 v8, int width) {
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width >> 2);
- const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
- asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
+ const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
-void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile("rep stosb " MEMSTORESTRING(al, 0) " \n"
- : "+D"(dst), // %0
- "+c"(width_tmp) // %1
- : "a"(v8) // %2
- : "memory", "cc");
+ asm volatile(
+
+ "rep stosb \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
}
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
size_t width_tmp = (size_t)(width);
- asm volatile("rep stosl " MEMSTORESTRING(eax, 0) " \n"
- : "+D"(dst_argb), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
+ asm volatile(
+
+ "rep stosl \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
#endif // HAS_SETROW_X86
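
The multiply in SetRow_X86 is a byte broadcast: v8 * 0x01010101
replicates the byte into all four byte lanes of the 32-bit pattern that
rep stosl stores, e.g. 0x5A * 0x01010101 == 0x5A5A5A5A. As a one-line
sketch (helper name hypothetical):

  #include <stdint.h>

  // Broadcast one byte to all four bytes of a 32-bit word.
  static inline uint32_t Broadcast8To32(uint8_t v8) {
    return v8 * 0x01010101u;
  }
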
#ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
-void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x00(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_YUY2TOYROW_SSE2
#ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
- VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm5"
- );
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
- VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
+ "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
- "sub %1,%2 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+ "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm1,%%ymm1 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
- VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x20,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1,(%1) \n"
+ "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x20,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
- 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
+static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 4 pixels at a time, with a 1 pixel tail loop
-void ARGBBlendRow_SSSE3(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0xf,%%xmm7 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x8,%%xmm6 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psllw $0x8,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
+ asm volatile(
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
- "49: \n"
- "add $0x3,%3 \n"
- "jl 99f \n"
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
- // 1 pixel loop.
- "91: \n"
- "movd " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movd " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movd " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "sub $0x1,%3 \n"
- "jge 91b \n"
- "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : "m"(kShuffleAlpha) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBBLENDROW_SSSE3
@@ -3590,10 +4580,10 @@
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
@@ -3642,10 +4632,10 @@
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
@@ -3699,50 +4689,50 @@
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
- 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
-static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
+static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+ 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "pcmpeqb %%xmm3,%%xmm3 \n"
- "pslld $0x18,%%xmm3 \n"
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "punpcklbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm1,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "punpckhbw %%xmm2,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "pand %%xmm3,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha0), // %3
- "m"(kShuffleAlpha1) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
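
Attenuation premultiplies each color channel by its alpha; the
duplicated-byte operands make pmulhuw act as a fixed-point multiply by
roughly a/255. One common scalar approximation of the same step (a
sketch, not necessarily bit-exact with the SIMD path):

  #include <stdint.h>

  // Approximate scalar attenuate: c * a / 255, biased up by +255
  // before the shift so full alpha preserves the value.
  static inline uint8_t Attenuate(uint8_t c, uint8_t a) {
    return (uint8_t)((c * a + 255) >> 8);
  }
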
@@ -3752,87 +4742,85 @@
128u, 128u, 14u, 15u, 14u, 15u,
14u, 15u, 128u, 128u};
// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "vbroadcastf128 %3,%%ymm4 \n"
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpslld $0x18,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
- "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpand %%ymm5,%%ymm6,%%ymm6 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpor %%ymm6,%%ymm0,%%ymm0 \n"
- MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kShuffleAlpha_AVX2) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBATTENUATEROW_AVX2
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
uintptr_t alpha;
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movzb " MEMACCESS2(0x03,0) ",%3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x07,0) ",%3 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
- MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "movlhps %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movzb 0x03(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x07(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "movd 0x00(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
@@ -3841,114 +4829,111 @@
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
uintptr_t alpha;
- asm volatile (
- "sub %0,%1 \n"
- "vbroadcastf128 %5,%%ymm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- // replace VPGATHER
- "movzb " MEMACCESS2(0x03,0) ",%3 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
- "movzb " MEMACCESS2(0x07,0) ",%3 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
- "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
- "movzb " MEMACCESS2(0x13,0) ",%3 \n"
- "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
- "movzb " MEMACCESS2(0x17,0) ",%3 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
- "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
- "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
- "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
- MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
- "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
- "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
- "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
- "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
- // end of VPGATHER
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb 0x03(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x07(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x0b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x0f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "movzb 0x13(%0),%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm0 \n"
+ "movzb 0x17(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm1 \n"
+ "movzb 0x1b(%0),%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm2 \n"
+ "movzb 0x1f(%0),%3 \n"
+ "vmovd 0x00(%4,%3,4),%%xmm3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
- "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
- "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
- "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
- "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
- "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
- "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
- "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width), // %2
- "=&r"(alpha) // %3
- : "r"(fixed_invtbl8), // %4
- "m"(kUnattenShuffleAlpha_AVX2) // %5
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ "vmovdqu (%0),%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "=&r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBUNATTENUATEROW_AVX2
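
// A scalar sketch of unattenuation, added for clarity; not libyuv API. The
// asm avoids per-pixel division by looking alpha up in fixed_invtbl8 (a
// fixed-point reciprocal table defined elsewhere in this file), so the
// straight division below only approximates its rounding.
#include <stdint.h>

static void ARGBUnattenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? (src[c] * 255u + a / 2) / a : src[c];
      dst[c] = (uint8_t)(v > 255 ? 255 : v);  // packuswb saturates likewise
    }
    dst[3] = (uint8_t)a;
    src += 4;
    dst += 4;
  }
}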
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrld $0x18,%%xmm2 \n"
- "psrld $0x18,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpcklbw %%xmm2,%%xmm3 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu 0x10(%0),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBGRAYROW_SSSE3
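
// Scalar sketch of the gray conversion above; illustrative only. The exact
// weights live in kARGBToYJ and kAddYJ64 (defined earlier in this file);
// 15/75/38 with +64 rounding is an assumption consistent with the psrlw $7
// (the three weights must sum to 128).
#include <stdint.h>

static void ARGBGrayRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t y =
        (uint8_t)((src[0] * 15 + src[1] * 75 + src[2] * 38 + 64) >> 7);
    dst[0] = dst[1] = dst[2] = y;  // B = G = R = luma
    dst[3] = src[3];               // alpha preserved
    src += 4;
    dst += 4;
  }
}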
@@ -3957,306 +4942,301 @@
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
- 17, 68, 35, 0, 17, 68, 35, 0};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+ 17, 68, 35, 0, 17, 68, 35, 0};
-static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
- 22, 88, 45, 0, 22, 88, 45, 0};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+ 22, 88, 45, 0, 22, 88, 45, 0};
-static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
- 24, 98, 50, 0, 24, 98, 50, 0};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+ 24, 98, 50, 0, 24, 98, 50, 0};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
- asm volatile (
- "movdqa %2,%%xmm2 \n"
- "movdqa %3,%%xmm3 \n"
- "movdqa %4,%%xmm4 \n"
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "phaddw %%xmm6,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm5 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm5 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm5 \n"
- "psrlw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqu " MEMACCESS(0) ",%%xmm6 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(0) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "sub $0x8,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "m"(kARGBToSepiaB), // %2
- "m"(kARGBToSepiaG), // %3
- "m"(kARGBToSepiaR) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqu (%0),%%xmm5 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBSEPIAROW_SSSE3
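
// Scalar sketch of the sepia kernel, using the kARGBToSepiaB/G/R weights
// above verbatim; the _Sketch name is illustrative, not libyuv API. Like
// the asm it works in place and relies on saturation (packuswb), since the
// R row can overflow a byte.
#include <stdint.h>

static inline uint8_t Clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

static void ARGBSepiaRow_Sketch(uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst[0], g = dst[1], r = dst[2];
    dst[0] = Clamp255((b * 17 + g * 68 + r * 35) >> 7);  // B
    dst[1] = Clamp255((b * 22 + g * 88 + r * 45) >> 7);  // G
    dst[2] = Clamp255((b * 24 + g * 98 + r * 50) >> 7);  // R
    // dst[3] (alpha) is untouched.
    dst += 4;
  }
}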
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
- asm volatile (
- "movdqu " MEMACCESS(3) ",%%xmm5 \n"
- "pshufd $0x00,%%xmm5,%%xmm2 \n"
- "pshufd $0x55,%%xmm5,%%xmm3 \n"
- "pshufd $0xaa,%%xmm5,%%xmm4 \n"
- "pshufd $0xff,%%xmm5,%%xmm5 \n"
+ asm volatile(
+ "movdqu (%3),%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqu " MEMACCESS(0) ",%%xmm6 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm7,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm6 \n"
- "psraw $0x6,%%xmm0 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm1 \n"
- "movdqu " MEMACCESS(0) ",%%xmm6 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm7 \n"
- "phaddsw %%xmm7,%%xmm6 \n"
- "psraw $0x6,%%xmm1 \n"
- "psraw $0x6,%%xmm6 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "punpcklbw %%xmm6,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "punpcklwd %%xmm1,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm6 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x10(%0),%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm6,0x10(%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
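
// Scalar sketch of the color-matrix kernel; illustrative only, and it
// ignores the intermediate 16-bit saturation of pmaddubsw/phaddsw.
// matrix_argb is 16 signed bytes: one {b,g,r,a} weight row per output
// channel, with 6 fractional bits (hence the psraw $0x6).
#include <stdint.h>

static inline uint8_t ClampTo255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void ARGBColorMatrixRow_Sketch(const uint8_t* src, uint8_t* dst,
                                      const int8_t* m, int width) {
  for (int i = 0; i < width; ++i) {
    int b = src[0], g = src[1], r = src[2], a = src[3];
    for (int c = 0; c < 4; ++c) {
      dst[c] = ClampTo255((b * m[c * 4 + 0] + g * m[c * 4 + 1] +
                           r * m[c * 4 + 2] + a * m[c * 4 + 3]) >> 6);
    }
    src += 4;
    dst += 4;
  }
}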
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
- asm volatile (
- "movd %2,%%xmm2 \n"
- "movd %3,%%xmm3 \n"
- "movd %4,%%xmm4 \n"
- "pshuflw $0x40,%%xmm2,%%xmm2 \n"
- "pshufd $0x44,%%xmm2,%%xmm2 \n"
- "pshuflw $0x40,%%xmm3,%%xmm3 \n"
- "pshufd $0x44,%%xmm3,%%xmm3 \n"
- "pshuflw $0x40,%%xmm4,%%xmm4 \n"
- "pshufd $0x44,%%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "pslld $0x18,%%xmm6 \n"
+ asm volatile(
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "pmullw %%xmm3,%%xmm0 \n"
- "movdqu " MEMACCESS(0) ",%%xmm7 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "pand %%xmm6,%%xmm7 \n"
- "paddw %%xmm4,%%xmm0 \n"
- "paddw %%xmm4,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "por %%xmm7,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "sub $0x4,%1 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqu (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqu (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x4,%1 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
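
// Scalar sketch of quantization; illustrative only. scale is consumed by
// pmulhuw, i.e. the high 16 bits of a 32-bit product, so it acts as a
// 16-bit fixed-point factor. Works in place and leaves alpha alone, which
// the asm does with the pand/por mask built in xmm6.
#include <stdint.h>

static void ARGBQuantizeRow_Sketch(uint8_t* dst, int scale, int interval_size,
                                   int interval_offset, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {
      int v = (dst[c] * scale >> 16) * interval_size + interval_offset;
      dst[c] = (uint8_t)(v > 255 ? 255 : v);  // packuswb saturates
    }
    dst += 4;
  }
}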
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
- asm volatile (
- "movd %3,%%xmm2 \n"
- "punpcklbw %%xmm2,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm2 \n"
+ uint32_t value) {
+ asm volatile(
+ "movd %3,%%xmm2 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm2,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2"
- );
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_ARGBSHADEROW_SSE2
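
// Scalar sketch of shading; illustrative only. value packs four per-channel
// scale bytes in B,G,R,A memory order, and the byte-replication trick makes
// pmulhuw + psrlw $8 compute roughly v * s / 255 per channel, exactly as in
// the attenuate kernels.
#include <stdint.h>

static void ARGBShadeRow_Sketch(const uint8_t* src, uint8_t* dst, int width,
                                uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      uint32_t s = (value >> (8 * c)) & 0xff;
      dst[c] = (uint8_t)((src[c] * 257u * (s * 257u)) >> 24);
    }
    src += 4;
    dst += 4;
  }
}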
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "pmulhuw %%xmm2,%%xmm0 \n"
- "pmulhuw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
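
// Scalar sketch of the multiply kernel; illustrative only. Per byte the
// SSE2 code forms s0*257 (punpcklbw with itself) and multiplies by the
// zero-extended s1, keeping the high word: (s0*257*s1) >> 16, roughly
// s0 * s1 / 255 with truncation. Alpha is multiplied like any other byte.
#include <stdint.h>

static void ARGBMultiplyRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = (uint8_t)((src0[i] * 257u * src1[i]) >> 16);
  }
}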
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ asm volatile(
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
- "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "vmovdqu (%1),%%ymm3 \n"
+ "lea 0x20(%1),%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
#if defined(__AVX2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ ,
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
@@ -4264,121 +5244,113 @@
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_ARGBADDROW_SSE2
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "vmovdqu %%ymm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0"
- );
+ asm volatile(
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpaddusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
}
#endif // HAS_ARGBADDROW_AVX2
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psubusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0", "xmm1"
- );
+ asm volatile(
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "vmovdqu %%ymm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0"
- );
+ asm volatile(
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "lea 0x20(%0),%0 \n"
+ "vpsubusb (%1),%%ymm0,%%ymm0 \n"
+ "lea 0x20(%1),%1 \n"
+ "vmovdqu %%ymm0,(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0");
}
#endif // HAS_ARGBSUBTRACTROW_AVX2
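
// Scalar sketches of the add/subtract kernels; illustrative only. paddusb
// and psubusb are byte-wise saturating operations, so every byte of the
// pixel, alpha included, is handled independently.
#include <stdint.h>

static void ARGBAddRow_Sketch(const uint8_t* a, const uint8_t* b,
                              uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = a[i] + b[i];
    dst[i] = (uint8_t)(v > 255 ? 255 : v);
  }
}

static void ARGBSubtractRow_Sketch(const uint8_t* a, const uint8_t* b,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = a[i] - b[i];
    dst[i] = (uint8_t)(v < 0 ? 0 : v);
  }
}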
@@ -4387,55 +5359,53 @@
// -1 0 1
// -2 0 2
// -1 0 1
-void SobelXRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
- asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "sub %0,%3 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
- MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
- MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "sub $0x8,%4 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "movq 0x02(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x00(%0,%2,1),%%xmm2 \n"
+ "movq 0x02(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SOBELXROW_SSE2
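
// Scalar sketch of SobelX; illustrative only. It is a direct reading of the
// kernel in the comment above: horizontal differences two pixels apart
// across three rows, middle row weighted by 2, absolute value clamped to a
// byte (pmaxsw against the negation, then packuswb).
#include <stdint.h>

static void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    if (s < 0) s = -s;
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}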
@@ -4444,52 +5414,50 @@
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
- asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "psubw %%xmm1,%%xmm0 \n"
- "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
- MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "psubw %%xmm2,%%xmm1 \n"
- "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
- MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "psubw %%xmm3,%%xmm2 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm1,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "psubw %%xmm0,%%xmm1 \n"
- "pmaxsw %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "sub $0x8,%3 \n"
- "jg 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x00(%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x01(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x02(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,0x00(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SOBELYROW_SSE2
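
// Scalar sketch of SobelY; illustrative only. Same shape as SobelX, but the
// differences run between two rows at offsets 0, 1 and 2 along the row,
// with the middle tap weighted by 2.
#include <stdint.h>

static void SobelYRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
            (y0[i + 2] - y1[i + 2]);
    if (s < 0) s = -s;
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}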
@@ -4499,83 +5467,79 @@
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm0,%%xmm2 \n"
- "punpckhbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm1 \n"
- "punpckhwd %%xmm2,%%xmm2 \n"
- "por %%xmm5,%%xmm1 \n"
- "por %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklwd %%xmm0,%%xmm3 \n"
- "punpckhwd %%xmm0,%%xmm0 \n"
- "por %%xmm5,%%xmm3 \n"
- "por %%xmm5,%%xmm0 \n"
- "movdqu %%xmm1," MEMACCESS(2) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
- "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
- "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "movdqu %%xmm1,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "movdqu %%xmm3,0x20(%2) \n"
+ "movdqu %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SOBELROW_SSE2
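
// Scalar sketch of SobelRow; illustrative only. The punpck ladder in the
// asm is just this interleave: one gray ARGB pixel per Sobel magnitude,
// with alpha forced to 0xff (the pcmpeqb/pslld mask). SobelToPlaneRow below
// stores the same saturated sum as a single plane byte instead.
#include <stdint.h>

static void SobelRow_Sketch(const uint8_t* sobelx, const uint8_t* sobely,
                            uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sobelx[i] + sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // paddusb saturates
    dst_argb[0] = v;     // B
    dst_argb[1] = v;     // G
    dst_argb[2] = v;     // R
    dst_argb[3] = 0xff;  // A
    dst_argb += 4;
  }
}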
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_SOBELTOPLANEROW_SSE2
@@ -4585,1168 +5549,1247 @@
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "sub %0,%1 \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
- // 8 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "paddusb %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm3 \n"
- "punpcklbw %%xmm5,%%xmm3 \n"
- "punpckhbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm4 \n"
- "punpcklbw %%xmm2,%%xmm4 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm6 \n"
- "punpcklwd %%xmm3,%%xmm6 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "punpcklwd %%xmm0,%%xmm7 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm6," MEMACCESS(2) " \n"
- "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
- "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "sub $0x10,%3 \n"
- "jg 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm6,(%2) \n"
+ "movdqu %%xmm4,0x10(%2) \n"
+ "movdqu %%xmm7,0x20(%2) \n"
+ "movdqu %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_SOBELXYROW_SSE2
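
// Scalar sketch of SobelXY; illustrative only. Matches the channel map in
// the comment above: B = Sobel Y, G = saturated X+Y, R = Sobel X, A = 0xff.
#include <stdint.h>

static void SobelXYRow_Sketch(const uint8_t* sobelx, const uint8_t* sobely,
                              uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sobelx[i] + sobely[i];
    dst_argb[0] = sobely[i];                     // B
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G
    dst_argb[2] = sobelx[i];                     // R
    dst_argb[3] = 0xff;                          // A
    dst_argb += 4;
  }
}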
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width) {
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm1,%%xmm1 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "test $0xf,%1 \n"
- "jne 49f \n"
+ asm volatile(
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
- // 4 pixel loop.
- LABELALIGN
- "40: \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "punpckhwd %%xmm1,%%xmm3 \n"
- "punpckhbw %%xmm1,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "punpcklwd %%xmm1,%%xmm4 \n"
- "punpckhwd %%xmm1,%%xmm5 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(2) ",%%xmm2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
- "paddd %%xmm0,%%xmm3 \n"
- "paddd %%xmm4,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
- "paddd %%xmm0,%%xmm4 \n"
- "paddd %%xmm5,%%xmm0 \n"
- "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
- "lea " MEMLEA(0x40,2) ",%2 \n"
- "paddd %%xmm0,%%xmm5 \n"
- "movdqu %%xmm2," MEMACCESS(1) " \n"
- "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
- "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
- "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ // 4 pixel loop.
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqu 0x10(%2),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqu 0x20(%2),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqu 0x30(%2),%%xmm5 \n"
+ "lea 0x40(%2),%2 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "movdqu %%xmm3,0x10(%1) \n"
+ "movdqu %%xmm4,0x20(%1) \n"
+ "movdqu %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
- // 1 pixel loop.
- LABELALIGN
- "10: \n"
- "movd " MEMACCESS(0) ",%%xmm2 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "movdqu " MEMACCESS(2) ",%%xmm2 \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
+ // 1 pixel loop.
+ LABELALIGN
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%2),%%xmm2 \n"
+ "lea 0x10(%2),%2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
- "19: \n"
- : "+r"(row), // %0
- "+r"(cumsum), // %1
- "+r"(previous_cumsum), // %2
- "+r"(width) // %3
- :
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
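
// Scalar sketch of the cumulative-sum row; illustrative only. A running
// per-channel total across the row plus the previous row's cumulative sums
// gives "everything above and to the left, inclusive", i.e. one row of an
// integral image.
#include <stdint.h>

static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}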
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft,
- const int32* botleft,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
int width,
int area,
- uint8* dst,
+ uint8_t* dst,
int count) {
- asm volatile (
- "movd %5,%%xmm5 \n"
- "cvtdq2ps %%xmm5,%%xmm5 \n"
- "rcpss %%xmm5,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
- "sub $0x4,%3 \n"
- "jl 49f \n"
- "cmpl $0x80,%5 \n"
- "ja 40f \n"
+ asm volatile(
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrld $0x10,%%xmm6 \n"
- "cvtdq2ps %%xmm6,%%xmm6 \n"
- "addps %%xmm6,%%xmm5 \n"
- "mulps %%xmm4,%%xmm5 \n"
- "cvtps2dq %%xmm5,%%xmm5 \n"
- "packssdw %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
- // 4 pixel small loop.
- LABELALIGN
- "4: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
- MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
- MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
- "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
- MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
- MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "pmulhuw %%xmm5,%%xmm0 \n"
- "pmulhuw %%xmm5,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jge 4b \n"
- "jmp 49f \n"
+ // 4 pixel small loop.
+ LABELALIGN
+ "4: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
- // 4 pixel loop \n"
- LABELALIGN
- "40: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
- MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
- MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
- "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
- MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
- MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm1,%%xmm1 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "mulps %%xmm4,%%xmm1 \n"
- "cvtdq2ps %%xmm2,%%xmm2 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "cvtps2dq %%xmm1,%%xmm1 \n"
- "cvtps2dq %%xmm2,%%xmm2 \n"
- "cvtps2dq %%xmm3,%%xmm3 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jge 40b \n"
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
- "49: \n"
- "add $0x3,%3 \n"
- "jl 19f \n"
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
- // 1 pixel loop \n"
- LABELALIGN
- "10: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "psubd " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "mulps %%xmm4,%%xmm0 \n"
- "cvtps2dq %%xmm0,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x4,2) ",%2 \n"
- "sub $0x1,%3 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(topleft), // %0
- "+r"(botleft), // %1
- "+r"(dst), // %2
- "+rm"(count) // %3
- : "r"((intptr_t)(width)), // %4
- "rm"(area) // %5
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "psubd 0x00(%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd 0x00(%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"((intptr_t)(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
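
// Scalar sketch of the box-filter average; illustrative only. The four
// corner reads of the integral image give the box total, and multiplying by
// 1/area (the rcpss above) averages it. Indexing follows the asm, which
// addresses int32 lanes at (ptr + width*4 bytes); rounding differs slightly
// between the asm's small-area (pmulhuw) and general (cvtps2dq) paths.
#include <stdint.h>

static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                             const int32_t* botleft,
                                             int width, int area,
                                             uint8_t* dst, int count) {
  float ooa = 1.0f / (float)area;
  for (int i = 0; i < count * 4; ++i) {  // 4 int32 lanes per ARGB pixel
    int32_t sum = topleft[i] - topleft[i + width] - botleft[i] +
                  botleft[i + width];
    dst[i] = (uint8_t)((float)sum * ooa);
  }
}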
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from a source image, stepping along an affine (sloped)
// path, into a single destination row.
LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb,
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* src_dudv,
int width) {
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
- asm volatile (
- "movq " MEMACCESS(3) ",%%xmm2 \n"
- "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
- "shl $0x10,%1 \n"
- "add $0x4,%1 \n"
- "movd %1,%%xmm5 \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ asm volatile(
+ "movq (%3),%%xmm2 \n"
+ "movq 0x08(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
- "pshufd $0x44,%%xmm7,%%xmm7 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "addps %%xmm7,%%xmm0 \n"
- "movlhps %%xmm0,%%xmm2 \n"
- "movdqa %%xmm7,%%xmm4 \n"
- "addps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "addps %%xmm4,%%xmm3 \n"
- "addps %%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
- // 4 pixel loop \n"
- LABELALIGN
- "40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
- "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
- "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
- "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
- MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
- "punpckldq %%xmm6,%%xmm1 \n"
- "addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1," MEMACCESS(2) " \n"
- "movd %%xmm0,%k1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%k5 \n"
- MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
- "punpckldq %%xmm6,%%xmm0 \n"
- "addps %%xmm4,%%xmm3 \n"
- "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ // 4 pixel loop
+ LABELALIGN
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+ "movd %%xmm0,%k1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k5 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd 0x00(%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
- "49: \n"
- "add $0x3,%4 \n"
- "jl 19f \n"
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
- // 1 pixel loop \n"
- LABELALIGN
- "10: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "packssdw %%xmm0,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
- "addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%k1 \n"
- MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- "movd %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x04,2) ",%2 \n"
- "sub $0x1,%4 \n"
- "jge 10b \n"
- "19: \n"
- : "+r"(src_argb), // %0
- "+r"(src_argb_stride_temp), // %1
- "+r"(dst_argb), // %2
- "+r"(src_dudv), // %3
- "+rm"(width), // %4
- "=&r"(temp) // %5
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 1 pixel loop
+ LABELALIGN
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%k1 \n"
+ "movd 0x00(%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x04(%2),%2 \n"
+ "sub $0x1,%4 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "=&r"(temp) // %5
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBAFFINEROW_SSE2
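
// Scalar sketch of the affine row; illustrative only. src_dudv holds
// {u, v, du, dv}: a starting source coordinate and a per-pixel step. Each
// destination pixel fetches one ARGB word at the truncated (u, v), which is
// what the cvttps2dq / pmaddwd address math above computes.
#include <stdint.h>
#include <string.h>

static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_stride,
                                 uint8_t* dst_argb, const float* src_dudv,
                                 int width) {
  float u = src_dudv[0], v = src_dudv[1];
  const float du = src_dudv[2], dv = src_dudv[3];
  for (int i = 0; i < width; ++i) {
    const uint8_t* p = src_argb + (int)v * src_stride + (int)u * 4;
    memcpy(dst_argb + i * 4, p, 4);  // copy one ARGB pixel
    u += du;
    v += dv;
  }
}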
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
- asm volatile (
- "sub %1,%0 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
+ asm volatile(
+ "sub %1,%0 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "mov $0x80808080,%%eax \n"
- "movd %%eax,%%xmm4 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "psubb %%xmm4,%%xmm0 \n"
- "psubb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm5,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm3 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "pmaddubsw %%xmm1,%%xmm3 \n"
- "paddw %%xmm4,%%xmm2 \n"
- "paddw %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "psubb %%xmm4,%%xmm0 \n"
+ "psubb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm3 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "pmaddubsw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm4,%%xmm2 \n"
+ "paddw %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x00(%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 100b \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu %%xmm0,0x00(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 100b \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+rm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+rm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_SSSE3
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
- asm volatile (
- "cmp $0x0,%3 \n"
- "je 100f \n"
- "sub %1,%0 \n"
- "cmp $0x80,%3 \n"
- "je 50f \n"
+ asm volatile(
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "sub %1,%0 \n"
+ "cmp $0x80,%3 \n"
+ "je 50f \n"
- "vmovd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x100,%3 \n"
- "vmovd %3,%%xmm5 \n"
- "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
- "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
- "vbroadcastss %%xmm5,%%ymm5 \n"
- "mov $0x80808080,%%eax \n"
- "vmovd %%eax,%%xmm4 \n"
- "vbroadcastss %%xmm4,%%ymm4 \n"
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x100,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vbroadcastss %%xmm5,%%ymm5 \n"
+ "mov $0x80808080,%%eax \n"
+ "vmovd %%eax,%%xmm4 \n"
+ "vbroadcastss %%xmm4,%%ymm4 \n"
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
- MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
- "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
- "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
- "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
- "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
- "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "jmp 99f \n"
+ // General purpose row blend.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vmovdqu 0x00(%1,%4,1),%%ymm2 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
+ "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
+ "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
- VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
- MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 50b \n"
- "jmp 99f \n"
+ // Blend 50 / 50.
+ LABELALIGN
+ "50: \n"
+ "vmovdqu (%1),%%ymm0 \n"
+ "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,0x00(%1,%0,1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "rep movsb " MEMMOVESTRING(1,0) " \n"
- "jmp 999f \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ LABELALIGN
+ "100: \n"
+ "rep movsb \n"
+ "jmp 999f \n"
- "99: \n"
- "vzeroupper \n"
- "999: \n"
- : "+D"(dst_ptr), // %0
- "+S"(src_ptr), // %1
- "+cm"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", "eax", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
- );
+ "99: \n"
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+cm"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_AVX2
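Note: both interpolate rows compute the same per-byte blend; the 0x80808080 bias plus pmaddubsw is just a way to get a rounded 8.8 fixed-point lerp out of a signed-by-unsigned multiply. A scalar sketch of the arithmetic (f == 0 takes the copy path, f == 128 takes the pavgb path):

#include <stddef.h>
#include <stdint.h>
// Scalar sketch of the row blend: dst = (src*(256-f) + below*f + 128) >> 8,
// where "below" is the row src_stride bytes past src.
static void InterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  int y1 = source_y_fraction;  // 0..256
  int y0 = 256 - y1;
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] =
        (uint8_t)((src_ptr[x] * y0 + src_ptr[x + src_stride] * y1 + 128) >> 8);
  }
}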
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
- asm volatile (
- "movdqu " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm5"
- );
+ asm volatile(
+
+ "movdqu (%3),%%xmm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
- asm volatile (
- "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
- "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm5"
- );
+ asm volatile(
+
+ "vbroadcastf128 (%3),%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
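Note: both shuffle rows apply one pshufb mask repeated across the register, i.e. a per-pixel byte permutation; the first four bytes of shuffler give, for each output channel, the index of the source byte within the same pixel. A minimal scalar sketch:

#include <stdint.h>
// Scalar sketch: output byte b of each pixel is input byte shuffler[b]
// (an index 0..3 within that pixel).
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int b = 0; b < 4; ++b) {
      dst_argb[i * 4 + b] = src_argb[i * 4 + shuffler[b]];
    }
  }
}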
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
- int width) {
- uintptr_t pixel_temp;
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "mov " MEMACCESS(4) ",%k2 \n"
- "cmp $0x3000102,%k2 \n"
- "je 3012f \n"
- "cmp $0x10203,%k2 \n"
- "je 123f \n"
- "cmp $0x30201,%k2 \n"
- "je 321f \n"
- "cmp $0x2010003,%k2 \n"
- "je 2103f \n"
-
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS(1) " \n"
- "movzb " MEMACCESS2(0x1,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x1,1) " \n"
- "movzb " MEMACCESS2(0x2,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x2,1) " \n"
- "movzb " MEMACCESS2(0x3,4) ",%2 \n"
- MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
- "mov %b2," MEMACCESS2(0x3,1) " \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- "lea " MEMLEA(0x4,1) ",%1 \n"
- "sub $0x1,%3 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "123: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
- "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
- "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
- "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jg 123b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "321: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x39,%%xmm0,%%xmm0 \n"
- "pshuflw $0x39,%%xmm0,%%xmm0 \n"
- "pshufhw $0x39,%%xmm1,%%xmm1 \n"
- "pshuflw $0x39,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jg 321b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "2103: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0x93,%%xmm0,%%xmm0 \n"
- "pshuflw $0x93,%%xmm0,%%xmm0 \n"
- "pshufhw $0x93,%%xmm1,%%xmm1 \n"
- "pshuflw $0x93,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jg 2103b \n"
- "jmp 99f \n"
-
- LABELALIGN
- "3012: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpckhbw %%xmm5,%%xmm1 \n"
- "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
- "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
- "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
- "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%3 \n"
- "jg 3012b \n"
-
- "99: \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "=&d"(pixel_temp), // %2
- "+r"(width) // %3
- : "r"(shuffler) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm5"
- );
-}
-#endif // HAS_ARGBSHUFFLEROW_SSE2
-
#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width) {
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(1) ",%%xmm2 \n"
- MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(3) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
- "lea " MEMLEA(0x20,3) ",%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3"
- );
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "add $0x10,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOYUY2ROW_SSE2
#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width) {
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(1) ",%%xmm2 \n"
- MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1," MEMACCESS(3) " \n"
- "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
- "lea " MEMLEA(0x20,3) ",%3 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_frame), // %3
- "+rm"(width) // %4
- :
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3"
- );
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq 0x00(%1,%2,1),%%xmm1 \n"
+ "add $0x8,%1 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "add $0x10,%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_I422TOUYVYROW_SSE2
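Note: these two packers differ only in byte order: YUY2 stores Y0 U Y1 V and UYVY stores U Y0 V Y1, with each U/V sample shared by a pair of pixels. A scalar sketch of the YUY2 case, assuming an even width (the C fallback handles the odd tail):

#include <stdint.h>
// Scalar sketch of I422 -> YUY2 packing: two Y samples share one U and one V.
static void I422ToYUY2Row_Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_yuy2,
                                 int width) {
  for (int x = 0; x < width; x += 2) {
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}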
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOYUY2ROW_AVX2
+
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
+ int width) {
+ asm volatile(
+
+ "sub %1,%2 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbw (%1),%%ymm1 \n"
+ "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
+ "add $0x10,%1 \n"
+ "vpsllw $0x8,%%ymm2,%%ymm2 \n"
+ "vpor %%ymm1,%%ymm2,%%ymm2 \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "add $0x20,%0 \n"
+ "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
+ "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
+ "vextractf128 $0x0,%%ymm1,(%3) \n"
+ "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
+ "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
+ "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
+ "lea 0x40(%3),%3 \n"
+ "sub $0x20,%4 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif // HAS_I422TOUYVYROW_AVX2
+
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
- asm volatile (
- "pxor %%xmm3,%%xmm3 \n"
+ asm volatile(
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm3,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm4 \n"
- "punpcklwd %%xmm3,%%xmm0 \n"
- "punpckhwd %%xmm3,%%xmm4 \n"
- "cvtdq2ps %%xmm0,%%xmm0 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
- "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
- "addps " MEMACCESS(3) ",%%xmm0 \n"
- "addps " MEMACCESS(3) ",%%xmm4 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "movdqa %%xmm5,%%xmm6 \n"
- "mulps %%xmm1,%%xmm2 \n"
- "mulps %%xmm5,%%xmm6 \n"
- "mulps %%xmm2,%%xmm1 \n"
- "mulps %%xmm6,%%xmm5 \n"
- "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
- "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
- "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
- "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
- "addps %%xmm2,%%xmm0 \n"
- "addps %%xmm6,%%xmm4 \n"
- "addps %%xmm1,%%xmm0 \n"
- "addps %%xmm5,%%xmm4 \n"
- "cvttps2dq %%xmm0,%%xmm0 \n"
- "cvttps2dq %%xmm4,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc"
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
- );
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps 0x10(%3),%%xmm0 \n"
+ "mulps 0x10(%3),%%xmm4 \n"
+ "addps (%3),%%xmm0 \n"
+ "addps (%3),%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps 0x20(%3),%%xmm2 \n"
+ "mulps 0x20(%3),%%xmm6 \n"
+ "mulps 0x30(%3),%%xmm1 \n"
+ "mulps 0x30(%3),%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
- asm volatile (
- "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
- "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
- "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
- "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+ asm volatile(
+ "vbroadcastf128 (%3),%%ymm4 \n"
+ "vbroadcastf128 0x10(%3),%%ymm5 \n"
+ "vbroadcastf128 0x20(%3),%%ymm6 \n"
+ "vbroadcastf128 0x30(%3),%%ymm7 \n"
- // 2 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
- "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
- "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
- "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
- "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
- "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
- "vcvttps2dq %%ymm0,%%ymm0 \n"
- "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "vmovq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x2,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(poly) // %3
- : "memory", "cc",
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ // 2 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels
+ "lea 0x8(%0),%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X *
+ // X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "vmovq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x2,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
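Note: as the comments in the AVX2 body spell out, each channel value X is remapped through a cubic C0 + C1*X + C2*X^2 + C3*X^3; poly holds four 4-float coefficient vectors (one lane per channel), loaded at offsets 0x00/0x10/0x20/0x30 as the asm does. In scalar form:

#include <stdint.h>
// Scalar sketch of the cubic remap; poly[c], poly[c+4], poly[c+8], poly[c+12]
// are C0..C3 for channel c, then the result is clamped to 0..255.
static void ARGBPolynomialRow_Sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb, const float* poly,
                                     int width) {
  for (int i = 0; i < width * 4; ++i) {
    float x = (float)src_argb[i];
    int c = i & 3;  // channel selects the coefficient lane
    float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    if (r < 0.f) r = 0.f;
    if (r > 255.f) r = 255.f;
    dst_argb[i] = (uint8_t)r;
  }
}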
#ifdef HAS_HALFFLOATROW_SSE2
static float kScaleBias = 1.9259299444e-34f;
-void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
- asm volatile (
- "pshufd $0x0,%3,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
+void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "movd %3,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm2 \n" // 8 shorts
- "add $0x10,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
- "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
- "punpckhwd %%xmm5,%%xmm3 \n"
- "cvtdq2ps %%xmm3,%%xmm3 \n"
- "mulps %%xmm4,%%xmm2 \n"
- "mulps %%xmm4,%%xmm3 \n"
- "psrld $0xd,%%xmm2 \n"
- "psrld $0xd,%%xmm3 \n"
- "packssdw %%xmm3,%%xmm2 \n"
- MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "x"(scale * kScaleBias) // %3
- : "memory", "cc",
- "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm2 \n" // 8 shorts
+ "add $0x10,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1
+ "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats
+ "punpckhwd %%xmm5,%%xmm3 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "psrld $0xd,%%xmm2 \n"
+ "psrld $0xd,%%xmm3 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm2,-0x10(%0,%1,1) \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(scale) // %3
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_HALFFLOATROW_SSE2
#ifdef HAS_HALFFLOATROW_AVX2
-void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
- asm volatile (
- "vbroadcastss %3, %%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- "sub %0,%1 \n"
+void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ scale *= kScaleBias;
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
- "add $0x20,%0 \n"
- "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
- "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vpsrld $0xd,%%ymm3,%%ymm3 \n"
- "vpsrld $0xd,%%ymm2,%%ymm2 \n"
- "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
- MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
- "sub $0x10,%2 \n"
- "jg 1b \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm2 \n" // 16 shorts
+ "add $0x20,%0 \n"
+ "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates
+ "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vpsrld $0xd,%%ymm3,%%ymm3 \n"
+ "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+ "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates
+ "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "x"(scale * kScaleBias) // %3
- : "memory", "cc",
- "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
- asm volatile (
- "vbroadcastss %3, %%ymm4 \n"
- "sub %0,%1 \n"
+void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "vbroadcastss %3, %%ymm4 \n"
+ "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
- "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
- MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "x"(scale) // %3
- : "memory", "cc",
- "xmm2", "xmm3", "xmm4"
- );
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+ "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+#if defined(__x86_64__)
+ : "x"(scale) // %3
+#else
+ : "m"(scale) // %3
+#endif
+ : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_HALFFLOATROW_F16C
#ifdef HAS_HALFFLOATROW_F16C
-void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
- asm volatile (
- "sub %0,%1 \n"
- // 16 pixel loop.
- LABELALIGN
- "1: \n"
- "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
- "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
- "vcvtdq2ps %%ymm2,%%ymm2 \n"
- "vcvtdq2ps %%ymm3,%%ymm3 \n"
- "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
- "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
- MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
- MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
- "add $0x20,%0 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "memory", "cc",
- "xmm2", "xmm3"
- );
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+ asm volatile(
+ "sub %0,%1 \n"
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints
+ "vpmovzxwd 0x10(%0),%%ymm3 \n"
+ "vcvtdq2ps %%ymm2,%%ymm2 \n"
+ "vcvtdq2ps %%ymm3,%%ymm3 \n"
+ "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
+ "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
+ "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
+ "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
+ "add $0x20,%0 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc", "xmm2", "xmm3");
}
#endif // HAS_HALFFLOATROW_F16C
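Note: the SSE2/AVX2 paths avoid cvtps2ph by folding kScaleBias into the scale in C (the new `scale *= kScaleBias;` lines): multiplying by 2^-112 rebiases the float exponent so that the float bit pattern shifted right by 13 is already the IEEE half-float pattern, which is all psrld $0xd plus the pack does. The F16C paths use the real conversion instruction instead. A scalar sketch of the bit trick for one value:

#include <stdint.h>
#include <string.h>
// Scalar sketch: with the 2^-112 bias folded into scale, the top bits of the
// product's float representation are the half-float bits. Valid for inputs
// landing in half's normal range; rounding is truncation, as in the asm.
static uint16_t HalfFromUint16_Sketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // kScaleBias == 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);  // what psrld $0xd plus the pack produce
}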
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb,
- const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
int width) {
uintptr_t pixel_temp;
- asm volatile (
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(0) ",%1 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x4,0) " \n"
- "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
- MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x3,0) " \n"
- "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
- MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x2,0) " \n"
- "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
- MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x1,0) " \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "movzb -0x1(%0),%1 \n"
+ "movzb 0x03(%3,%1,4),%1 \n"
+ "mov %b1,-0x1(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
}
#endif // HAS_ARGBCOLORTABLEROW_X86
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
+ int width) {
uintptr_t pixel_temp;
- asm volatile (
- // 1 pixel loop.
- LABELALIGN
- "1: \n"
- "movzb " MEMACCESS(0) ",%1 \n"
- "lea " MEMLEA(0x4,0) ",%0 \n"
- MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x4,0) " \n"
- "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
- MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x3,0) " \n"
- "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
- MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
- "mov %b1," MEMACCESS2(-0x2,0) " \n"
- "dec %2 \n"
- "jg 1b \n"
- : "+r"(dst_argb), // %0
- "=&d"(pixel_temp), // %1
- "+r"(width) // %2
- : "r"(table_argb) // %3
- : "memory", "cc");
+ asm volatile(
+ // 1 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movzb (%0),%1 \n"
+ "lea 0x4(%0),%0 \n"
+ "movzb 0x00(%3,%1,4),%1 \n"
+ "mov %b1,-0x4(%0) \n"
+ "movzb -0x3(%0),%1 \n"
+ "movzb 0x01(%3,%1,4),%1 \n"
+ "mov %b1,-0x3(%0) \n"
+ "movzb -0x2(%0),%1 \n"
+ "movzb 0x02(%3,%1,4),%1 \n"
+ "mov %b1,-0x2(%0) \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "=&d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
}
#endif // HAS_RGBCOLORTABLEROW_X86
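Note: both table rows are in-place per-channel lookups: the table is laid out with a stride of 4, so each channel has its own interleaved 256-entry sub-table (the RGB variant leaves alpha untouched). A scalar sketch:

#include <stdint.h>
// Scalar sketch of the in-place remap: channel c of each pixel becomes
// table_argb[old_value * 4 + c]. The RGB variant stops after 3 channels.
static void ARGBColorTableRow_Sketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    }
    dst_argb += 4;
  }
}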
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff) {
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
uintptr_t pixel_temp;
uintptr_t table_temp;
- asm volatile (
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0x8,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
- // 4 pixel loop.
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(2) ",%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "phaddw %%xmm0,%%xmm0 \n"
- "pand %%xmm4,%%xmm0 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ // 4 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%2),%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb " MEMACCESS(2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS(3) " \n"
- "movzb " MEMACCESS2(0x1,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x1,3) " \n"
- "movzb " MEMACCESS2(0x2,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x2,3) " \n"
- "movzb " MEMACCESS2(0x3,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0x3,3) " \n"
+ "movzb (%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,(%3) \n"
+ "movzb 0x1(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x1(%3) \n"
+ "movzb 0x2(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x2(%3) \n"
+ "movzb 0x3(%2),%0 \n"
+ "mov %b0,0x3(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb " MEMACCESS2(0x4,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x4,3) " \n"
- "movzb " MEMACCESS2(0x5,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x5,3) " \n"
- "movzb " MEMACCESS2(0x6,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x6,3) " \n"
- "movzb " MEMACCESS2(0x7,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0x7,3) " \n"
+ "movzb 0x4(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x4(%3) \n"
+ "movzb 0x5(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x5(%3) \n"
+ "movzb 0x6(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x6(%3) \n"
+ "movzb 0x7(%2),%0 \n"
+ "mov %b0,0x7(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
- "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movzb " MEMACCESS2(0x8,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x8,3) " \n"
- "movzb " MEMACCESS2(0x9,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0x9,3) " \n"
- "movzb " MEMACCESS2(0xa,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xa,3) " \n"
- "movzb " MEMACCESS2(0xb,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0xb,3) " \n"
+ "movzb 0x8(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x8(%3) \n"
+ "movzb 0x9(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0x9(%3) \n"
+ "movzb 0xa(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xa(%3) \n"
+ "movzb 0xb(%2),%0 \n"
+ "mov %b0,0xb(%3) \n"
- "movd %%xmm0,%k1 \n" // 32 bit offset
- "add %5,%1 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
- "movzb " MEMACCESS2(0xc,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xc,3) " \n"
- "movzb " MEMACCESS2(0xd,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xd,3) " \n"
- "movzb " MEMACCESS2(0xe,2) ",%0 \n"
- MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
- "mov %b0," MEMACCESS2(0xe,3) " \n"
- "movzb " MEMACCESS2(0xf,2) ",%0 \n"
- "mov %b0," MEMACCESS2(0xf,3) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "lea " MEMLEA(0x10,3) ",%3 \n"
- "sub $0x4,%4 \n"
- "jg 1b \n"
- : "=&d"(pixel_temp), // %0
- "=&a"(table_temp), // %1
- "+r"(src_argb), // %2
- "+r"(dst_argb), // %3
- "+rm"(width) // %4
- : "r"(luma), // %5
- "rm"(lumacoeff) // %6
- : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
- );
+ "movzb 0xc(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xc(%3) \n"
+ "movzb 0xd(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xd(%3) \n"
+ "movzb 0xe(%2),%0 \n"
+ "movzb 0x00(%1,%0,1),%0 \n"
+ "mov %b0,0xe(%3) \n"
+ "movzb 0xf(%2),%0 \n"
+ "mov %b0,0xf(%3) \n"
+ "lea 0x10(%2),%2 \n"
+ "lea 0x10(%3),%3 \n"
+ "sub $0x4,%4 \n"
+ "jg 1b \n"
+ : "=&d"(pixel_temp), // %0
+ "=&a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
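Note: the luma variant computes an approximate luma per pixel with pmaddubsw against lumacoeff, masks it to a multiple of 256 (the pand with 0xff00 words), and uses it to select one of 256 consecutive 256-byte tables inside luma; B, G and R are remapped through that table while A is copied through. A scalar sketch under that reading (the lumacoeff byte unpacking here is my assumption):

#include <stdint.h>
// Scalar sketch: luma holds 256 consecutive 256-byte tables; the pixel's
// luma byte selects the table, and B/G/R are remapped through it.
static void ARGBLumaColorTableRow_Sketch(const uint8_t* src_argb,
                                         uint8_t* dst_argb, int width,
                                         const uint8_t* luma,
                                         uint32_t lumacoeff) {
  uint32_t bc = lumacoeff & 0xff;          // assumed B coefficient
  uint32_t gc = (lumacoeff >> 8) & 0xff;   // assumed G coefficient
  uint32_t rc = (lumacoeff >> 16) & 0xff;  // assumed R coefficient
  for (int i = 0; i < width; ++i) {
    uint32_t lum = src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc;
    const uint8_t* table = luma + (lum & 0xff00);  // table = luma byte * 256
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied
    src_argb += 4;
    dst_argb += 4;
  }
}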
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// Blend and shuffle constants for NV21ToYUV24Row_AVX2.
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+ 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+ 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+ 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+ 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+ 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+ 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+ 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+ 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+ 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+ 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+ 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+ 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+ 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+ 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+ 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+ 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ uint8_t* src_y_ptr;
+ uint64_t src_offset = 0;
+ uint64_t width64;
+
+ width64 = width;
+ src_y_ptr = (uint8_t*)src_y;
+
+ asm volatile(
+ "vmovdqu %5, %%ymm0 \n" // init blend value
+ "vmovdqu %6, %%ymm1 \n" // init blend value
+ "vmovdqu %7, %%ymm2 \n" // init blend value
+ // "sub $0x20, %3 \n" //sub 32 from width for final loop
+
+ LABELALIGN
+ "1: \n" // label 1
+ "vmovdqu (%0,%4), %%ymm3 \n" // src_y
+ "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
+ "vmovdqu (%1), %%ymm5 \n" // src_uv
+ "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
+ "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for
+ // shuf
+ "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for
+ // shuf
+ "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
+ "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for
+ // shuf
+ "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
+ "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
+ "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
+ "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
+ "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
+ "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
+ "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
+ "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
+ "add $0x20, %4 \n" // add to src buffer
+ // ptr
+ "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
+ "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
+ "vmovdqu %%ymm4, (%2) \n" // store dst_yuv
+ "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
+ "add $0x60,%2 \n" // add to dst buffer
+ // ptr
+ // "cmp %3, %4 \n" //(width64 -
+ // 32 bytes) and src_offset
+ "sub $0x20,%3 \n" // 32 pixels per loop
+ "jg 1b \n"
+      "vzeroupper \n"  // sse-avx2 transitions
+
+ : "+r"(src_y), //%0
+ "+r"(src_vu), //%1
+ "+r"(dst_yuv24), //%2
+ "+r"(width64), //%3
+ "+r"(src_offset) //%4
+ : "m"(kBLEND0), //%5
+ "m"(kBLEND1), //%6
+ "m"(kBLEND2), //%7
+ "m"(kSHUF0), //%8
+ "m"(kSHUF1), //%9
+ "m"(kSHUF2), //%10
+ "m"(kSHUF3), //%11
+ "m"(kSHUF4), //%12
+ "m"(kSHUF5) //%13
+ : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
+ "xmm13", "xmm14", "xmm15");
+}
+#endif // HAS_NV21TOYUV24ROW_AVX2
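Note: the blend/shuffle choreography above reduces to a 3-bytes-per-pixel repack: each output pixel carries its own Y plus the V and U of its 2x-subsampled pair. A scalar sketch, assuming the V, U, Y output order of the C reference row (my reading of it) and an even width:

#include <stdint.h>
// Scalar sketch of the repack: each pixel emits {V, U, Y}; a VU pair is
// shared by two horizontally adjacent pixels.
static void NV21ToYUV24Row_Sketch(const uint8_t* src_y, const uint8_t* src_vu,
                                  uint8_t* dst_yuv24, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_yuv24[0] = src_vu[0];  // V
    dst_yuv24[1] = src_vu[1];  // U
    dst_yuv24[2] = src_y[0];   // Y0
    dst_yuv24[3] = src_vu[0];  // V
    dst_yuv24[4] = src_vu[1];  // U
    dst_yuv24[5] = src_y[1];   // Y1
    src_y += 2;
    src_vu += 2;
    dst_yuv24 += 6;
  }
}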
+
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
diff --git a/files/source/row_mmi.cc b/files/source/row_mmi.cc
new file mode 100644
index 0000000..d8726d0
--- /dev/null
+++ b/files/source/row_mmi.cc
@@ -0,0 +1,6042 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI.
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask = 0xff000000ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask] \n\t"
+ "or %[src1], %[src1], %[mask] \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [mask] "f"(mask)
+ : "memory");
+}
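Note: the gslwlc1/gslwrc1 pairs above are just unaligned 32-bit loads; the routine reads overlapping 4-byte windows of the packed RGB24 stream and ORs in an opaque alpha (the 0xff000000 mask). The equivalent scalar loop:

#include <stdint.h>
// Scalar sketch of RGB24 -> ARGB: copy the three color bytes, force A = 0xff.
static void RGB24ToARGBRow_Sketch(const uint8_t* src_rgb24, uint8_t* dst_argb,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = src_rgb24[0];  // B
    dst_argb[1] = src_rgb24[1];  // G
    dst_argb[2] = src_rgb24[2];  // R
    dst_argb[3] = 0xff;          // A
    src_rgb24 += 3;
    dst_argb += 4;
  }
}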
+
+void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ uint64_t src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xff000000ULL;
+ const uint64_t mask2 = 0xc6;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t"
+
+ "or %[src0], %[src0], %[mask1] \n\t"
+ "punpcklbh %[src0], %[src0], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask2] \n\t"
+ "or %[src1], %[src1], %[mask1] \n\t"
+ "punpcklbh %[src1], %[src1], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask2] \n\t"
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width)
+ : "memory");
+}
+
+void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x6c;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t"
+ "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t"
+ "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[src1], %[src1], %[zero] \n\t"
+ "pextrh %[ftmp2], %[ftmp0], %[three] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[one] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t"
+ "pextrh %[ftmp3], %[ftmp1], %[two] \n\t"
+ "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pextrh %[ftmp2], %[src1], %[zero] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t"
+
+ "daddiu %[src_raw], %[src_raw], 0x0c \n\t"
+ "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3])
+ : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03)
+ : "memory");
+}
+
+void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[5];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[c1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4])
+ : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02),
+ [four] "f"(0x04)
+ : "memory");
+}
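Note: the shift pairs in the body (psllh by 3 then psrlh by 2, and so on) are the standard bit-replication trick for widening 5- and 6-bit channels to 8 bits so full-scale values map to 0xff. Scalar form:

#include <stdint.h>
// Scalar sketch of 565 -> 8888: replicate each channel's high bits into its
// low bits so 0x1f widens to 0xff and 0x00 stays 0x00.
static void RGB565ToARGBRow_Sketch(const uint8_t* src_rgb565,
                                   uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t p = (uint16_t)(src_rgb565[0] | (src_rgb565[1] << 8));
    uint8_t b = p & 0x1f;
    uint8_t g = (p >> 5) & 0x3f;
    uint8_t r = (p >> 11) & 0x1f;
    dst_argb[0] = (uint8_t)((b << 3) | (b >> 2));
    dst_argb[1] = (uint8_t)((g << 2) | (g >> 4));
    dst_argb[2] = (uint8_t)((r << 3) | (r >> 2));
    dst_argb[3] = 0xff;
    src_rgb565 += 2;
    dst_argb += 4;
  }
}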
+
+void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0003000300030003;
+ uint64_t c3 = 0x007c007c007c007c;
+ uint64_t c4 = 0x0001000100010001;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psrlh %[a], %[src1], %[seven] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "xor %[a], %[a], %[c1] \n\t"
+ "paddb %[a], %[a], %[c4] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07)
+ : "memory");
+}
+
+void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t ftmp[6];
+ uint64_t c0 = 0x000f000f000f000f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psrlh %[a], %[src1], %[four] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "psllh %[src0], %[a], %[four] \n\t"
+ "or %[a], %[src0], %[a] \n\t"
+ "packushb %[b], %[b], %[r] \n\t"
+ "packushb %[g], %[g], %[a] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t"
+ "punpckhhw %[r], %[src0], %[src1] \n\t"
+ "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t"
+ "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t"
+ "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]),
+ [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5])
+ : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08),
+ [four] "f"(0x04)
+ : "memory");
+}
+
+void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t"
+
+ "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t"
+ "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t mask0 = 0xc6;
+ uint64_t mask1 = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[ftmp0], %[src0], %[zero] \n\t"
+ "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t"
+ "punpckhbh %[ftmp1], %[src0], %[zero] \n\t"
+ "punpcklbh %[ftmp2], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+
+ "pextrh %[src0], %[ftmp1], %[two] \n\t"
+ "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[one] \n\t"
+
+ "pextrh %[src0], %[ftmp2], %[two] \n\t"
+ "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[one] \n\t"
+ "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t"
+ "pextrh %[src0], %[ftmp2], %[zero] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "pinsrh_0 %[src1], %[src1], %[src0] \n\t"
+ "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t"
+ "packushb %[src1], %[src1], %[zero] \n\t"
+
+ "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t"
+ "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t"
+ "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]),
+ [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00),
+ [one] "f"(0x01), [two] "f"(0x02)
+ : "memory");
+}
+
+void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05),
+ [eleven] "f"(0x0b)
+ : "memory");
+}
+
+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of dither4 is the upper-left value of the 4x4 matrix.
+// The 4 values are passed as an int, then referenced as an array, so
+// endian will not affect the order of the original matrix. But dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
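+// After the dither value is added, the pcmpgth/or/and triple saturates
+// each widened channel to 255 before the usual 565 shift-and-pack. A rough
+// scalar equivalent for one pixel (illustrative sketch only; clamp255 is a
+// hypothetical helper, not necessarily the exact C fallback):
+//   int d = ((const uint8_t*)&dither4)[x & 3];
+//   uint8_t b = clamp255(src_argb[0] + d);
+//   uint8_t g = clamp255(src_argb[1] + d);
+//   uint8_t r = clamp255(src_argb[2] + d);
+//   *(uint16_t*)dst_rgb = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3);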
+void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[3];
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+
+ __asm__ volatile(
+ "punpcklbh %[dither], %[dither], %[zero] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+
+ "paddh %[b], %[b], %[dither] \n\t"
+ "paddh %[g], %[g], %[dither] \n\t"
+ "paddh %[r], %[r], %[dither] \n\t"
+ "pcmpgth %[src0], %[b], %[c0] \n\t"
+ "or %[src0], %[src0], %[b] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[g], %[c0] \n\t"
+ "or %[src0], %[src0], %[g] \n\t"
+ "and %[g], %[src0], %[c0] \n\t"
+ "pcmpgth %[src0], %[r], %[c0] \n\t"
+ "or %[src0], %[src0], %[r] \n\t"
+ "and %[r], %[src0], %[c0] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[two] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[eleven] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02),
+ [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b)
+ : "memory");
+}
+
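+// Pack 4 ARGB pixels per iteration into ARGB1555:
+// ((A >> 7) << 15) | ((R >> 3) << 10) | ((G >> 3) << 5) | (B >> 3).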
+void ARGBToARGB1555Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[three] \n\t"
+ "psrlh %[g], %[g], %[three] \n\t"
+ "psrlh %[r], %[r], %[three] \n\t"
+ "psrlh %[a], %[a], %[seven] \n\t"
+
+ "psllh %[g], %[g], %[five] \n\t"
+ "psllh %[r], %[r], %[ten] \n\t"
+ "psllh %[a], %[a], %[fifteen] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05),
+ [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f)
+ : "memory");
+}
+
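+// Pack 4 ARGB pixels per iteration into ARGB4444:
+// ((A >> 4) << 12) | ((R >> 4) << 8) | ((G >> 4) << 4) | (B >> 4).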
+void ARGBToARGB4444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t src0, src1;
+ uint64_t ftmp[4];
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t"
+
+ "punpcklbh %[b], %[src0], %[src1] \n\t"
+ "punpckhbh %[g], %[src0], %[src1] \n\t"
+ "punpcklbh %[src0], %[b], %[g] \n\t"
+ "punpckhbh %[src1], %[b], %[g] \n\t"
+ "punpcklbh %[b], %[src0], %[zero] \n\t"
+ "punpckhbh %[g], %[src0], %[zero] \n\t"
+ "punpcklbh %[r], %[src1], %[zero] \n\t"
+ "punpckhbh %[a], %[src1], %[zero] \n\t"
+
+ "psrlh %[b], %[b], %[four] \n\t"
+ "psrlh %[g], %[g], %[four] \n\t"
+ "psrlh %[r], %[r], %[four] \n\t"
+ "psrlh %[a], %[a], %[four] \n\t"
+
+ "psllh %[g], %[g], %[four] \n\t"
+ "psllh %[r], %[r], %[eight] \n\t"
+ "psllh %[a], %[a], %[twelve] \n\t"
+ "or %[b], %[b], %[g] \n\t"
+ "or %[b], %[b], %[r] \n\t"
+ "or %[b], %[b], %[a] \n\t"
+
+ "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t"
+ "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x10 \n\t"
+ "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x04 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]),
+ [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3])
+ : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width),
+ [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08),
+ [twelve] "f"(0x0c)
+ : "memory");
+}
+
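+// Luma, 8 pixels per iteration. The mask halfwords (25, 129, 66, 1) and
+// the 0x1080 bias inserted over alpha make each pmaddhw/paddw/psrlw group
+// compute the BT.601 sum Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8.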
+void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
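+// Chroma, 16 pixels (8 U/V pairs) per iteration. Each 2x2 block is averaged
+// across the two source rows (psrlh by 2 after three paddh), then mask_u and
+// mask_v with the doubled 0x4040 bias (0x8080) evaluate
+// U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8 and
+// V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8.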
+void ARGBToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
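+// Same BT.601 luma as ARGBToYRow_MMI for BGRA byte order (A, R, G, B in
+// memory): the mask halfwords are reversed to (1, 66, 129, 25) and the bias
+// is inserted at halfword 0, where alpha now sits.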
+void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0019008100420001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
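+// BGRA variant of ARGBToUVRow_MMI: dsrl and the swapped pinsrh_3/pinsrh_0
+// bias placement account for the A, R, G, B byte order, and the psubw
+// operand order flips to keep the U/V signs correct.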
+void BGRAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x00020070004a0026;
+ const uint64_t mask_v = 0x0012005e00700002;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsrl %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_0 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
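+// ABGR (R, G, B, A in memory) luma: identical structure to ARGBToYRow_MMI
+// with the R and B coefficients swapped in the mask (66, 129, 25, 1).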
+void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
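+// ABGR variant of ARGBToUVRow_MMI: the same U/V coefficients appear with
+// their halfword order reversed in mask_u/mask_v to match the reversed
+// R/B positions.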
+void ABGRToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x00020070004a0026;
+ const uint64_t mask_v = 0x0012005e00700002;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
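+// RGBA (A, B, G, R in memory) luma: mask halfwords (1, 25, 129, 66) with
+// the bias replacing the leading alpha via pinsrh_0.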
+void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0042008100190001;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
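+// RGBA variant of ARGBToUVRow_MMI: alpha leads each pixel, so pinsrh_0
+// overwrites it with the U bias directly, and dsrl shifts the halfwords
+// down before the V bias is inserted at halfword 3.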
+void RGBAToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t"
+ "dsrl %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t"
+ "dsrl %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t"
+ "dsrl %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t"
+ "dsrl %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_0 %[src_lo], %[src0], %[value] \n\t"
+ "dsrl %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
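+// RGB24 (B, G, R packed, 3 bytes per pixel) luma: the 8-byte loads overlap
+// at 6-byte strides so each read covers two pixels; the low pixel is widened
+// directly and dsll by 8 re-aligns the second pixel for punpckhbh, with the
+// bias replacing the stray fourth halfword.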
+void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001004200810019;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
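+// RGB24ToUVRow_MMI subsamples 16 RGB24 pixels from two adjacent rows
+// (src_rgb0 and src_rgb0 + src_stride_rgb) into 8 U and 8 V bytes per
+// pass: each 2x2 block is averaged (sum >> 2), then pmaddhw applies the
+// fixed-point chroma coefficients in |mask_u|/|mask_v|; the inserted
+// 0x4040 |value| term is scaled by the 0x0002 coefficient to form the
+// usual 0x8080 bias.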
+void RGB24ToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
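+// RAWToYRow_MMI is the RGB24ToYRow_MMI analogue for RAW (R, G, B byte
+// order) input; |mask| swaps the 25 and 66 coefficients to match the
+// reversed channel order.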
+void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest0, dest1, dest2, dest3;
+ const uint64_t value = 0x1080;
+ const uint64_t mask = 0x0001001900810042;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[dest0], %[src] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[dest1], %[src] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[dest2], %[src] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t"
+ "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "dsll %[src], %[src], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src], %[zero] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[dest3], %[src] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3)
+ : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width),
+ [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
+ [zero] "f"(0x00)
+ : "memory");
+}
+
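+// RAWToUVRow_MMI mirrors RGB24ToUVRow_MMI for RAW input: the same 2x2
+// averaging of two adjacent rows, with |mask_u|/|mask_v| (and the
+// pinsrh placement of the 0x4040 bias term) reordered for the R, G, B
+// byte layout.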
+void RAWToUVRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x00020070004a0026;
+ const uint64_t mask_v = 0x0012005e00700002;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t"
+ "dsll %[dest0_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src1], %[src0] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src0], %[src1] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t"
+ "dsll %[dest1_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src1], %[src0] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src0], %[src1] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t"
+ "dsll %[dest2_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src1], %[src0] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src0], %[src1] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t"
+ "dsll %[dest3_v], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "dsll %[src0], %[src0], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "paddh %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpcklbh %[src_lo], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_lo] \n\t"
+ "dsll %[src1], %[src1], %[eight] \n\t"
+ "punpckhbh %[src_hi], %[src1], %[zero] \n\t"
+ "paddh %[src0], %[src0], %[src_hi] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "pinsrh_3 %[src_lo], %[src0], %[value] \n\t"
+ "dsll %[src_hi], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src1], %[src0] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src0], %[src1] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+ [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02),
+ [sixteen] "f"(0x10)
+ : "memory");
+}
+
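+// ARGBToYJRow_MMI converts 8 ARGB pixels per pass to full-range (JPEG)
+// luma: the coefficients 15/75/38 in |mask1| plus the 0x40 rounding term,
+// shifted right by 7, compute Y = (15B + 75G + 38R + 64) >> 7.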
+void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest0, dest1, dest2, dest3;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift = 0x07;
+ const uint64_t value = 0x0040;
+ const uint64_t mask0 = 0x0;
+  const uint64_t mask1 = 0x00010026004b000f;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest2], %[dest2], %[shift] \n\t"
+
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask1] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask1] \n\t"
+ "punpcklwd %[tmp0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[tmp1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest3], %[dest3], %[shift] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1)
+ : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value),
+ [width] "r"(width)
+ : "memory");
+}
+
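+// ARGBToUVJRow_MMI subsamples 16 ARGB pixels from two adjacent rows into
+// 8 U and 8 V bytes of full-range (JPEG) chroma per pass; pavgh averages
+// each 2x2 block before the |mask_u|/|mask_v| multiplies.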
+void ARGBToUVJRow_MMI(const uint8_t* src_rgb0,
+ int src_stride_rgb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src_rgb1;
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x002b0054007f0002;
+ const uint64_t mask_v = 0x0002007f006b0014;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest0_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest1_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest2_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[dest3_u], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+
+ "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t"
+ "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t"
+ "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t"
+ "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "punpcklbh %[src0], %[src1], %[zero] \n\t"
+ "punpckhbh %[src1], %[src1], %[zero] \n\t"
+ "pavgh %[src0], %[src_lo], %[src0] \n\t"
+ "pavgh %[src1], %[src_hi], %[src1] \n\t"
+ "pavgh %[src0], %[src0], %[src1] \n\t"
+ "dsll %[src_lo], %[src0], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src0], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),
+ [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]),
+ [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]),
+ [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]),
+ [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11])
+ : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),
+      [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10)
+ : "memory");
+}
+
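+// RGB565ToYRow_MMI unpacks 8 RGB565 pixels per pass, widens the 5/6/5
+// channels to 8 bits by replicating the high bits into the low bits,
+// then applies the same 25/129/66 luma coefficients as RGB24ToYRow_MMI.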
+void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psrlh %[r], %[src1], %[three] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[two] \n\t"
+ "psrlh %[src1], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05),
+ [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04)
+ : "memory");
+}
+
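+// ARGB1555ToYRow_MMI unpacks 8 ARGB1555 pixels per pass, widens the
+// 5-bit B/G/R channels to 8 bits, and reuses the standard luma mask and
+// 0x1080 offset.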
+void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+ const uint64_t value = 0x1080108010801080;
+ const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0003000300030003;
+  const uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g], %[src1], %[c2] \n\t"
+ "psllh %[g], %[g], %[three] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "and %[r], %[src1], %[c3] \n\t"
+ "psrlh %[r], %[r], %[two] \n\t"
+ "psllh %[src0], %[b], %[three] \n\t"
+ "psrlh %[src1], %[b], %[two] \n\t"
+ "or %[b], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[g], %[three] \n\t"
+ "psrlh %[src1], %[g], %[two] \n\t"
+ "or %[g], %[src0], %[src1] \n\t"
+ "psllh %[src0], %[r], %[three] \n\t"
+ "psrlh %[src1], %[r], %[two] \n\t"
+ "or %[r], %[src0], %[src1] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08),
+      [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02)
+ : "memory");
+}
+
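+// ARGB4444ToYRow_MMI unpacks 8 ARGB4444 pixels per pass, widens each
+// 4-bit channel to 8 bits by replicating the nibble, and reuses the
+// standard luma mask.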
+void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t ftmp[11];
+  const uint64_t value = 0x1080108010801080;
+  const uint64_t mask = 0x0001004200810019;
+  const uint64_t c0 = 0x000f000f000f000f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest0], %[src0], %[src1] \n\t"
+ "psrlw %[dest0], %[dest0], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest1], %[src0], %[src1] \n\t"
+ "psrlw %[dest1], %[dest1], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "psrlh %[src1], %[src0], %[eight] \n\t"
+ "and %[b], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g], %[src0], %[four] \n\t"
+ "and %[r], %[src1], %[c0] \n\t"
+ "psllh %[src0], %[b], %[four] \n\t"
+ "or %[b], %[src0], %[b] \n\t"
+ "psllh %[src0], %[g], %[four] \n\t"
+ "or %[g], %[src0], %[g] \n\t"
+ "psllh %[src0], %[r], %[four] \n\t"
+ "or %[r], %[src0], %[r] \n\t"
+ "punpcklhw %[src0], %[b], %[r] \n\t"
+ "punpcklhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest2], %[src0], %[src1] \n\t"
+ "psrlw %[dest2], %[dest2], %[eight] \n\t"
+
+ "punpckhhw %[src0], %[b], %[r] \n\t"
+ "punpckhhw %[src1], %[g], %[value] \n\t"
+ "punpcklhw %[src_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[src_hi], %[src0], %[src1] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
+ "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t"
+ "paddw %[dest3], %[src0], %[src1] \n\t"
+ "psrlw %[dest3], %[dest3], %[eight] \n\t"
+
+ "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
+ "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t"
+ "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]),
+ [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]),
+ [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10])
+ : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y),
+ [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0),
+ [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04)
+ : "memory");
+}
+
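+// RGB565ToUVRow_MMI computes subsampled chroma from two adjacent RGB565
+// rows: the 5/6/5 channels are extracted, 2x2 blocks are combined and
+// rescaled, and pshufh/pmaddhw apply the |mask_u|/|mask_v| coefficients
+// with the packed 0x2020 |value| bias term.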
+void RGB565ToUVRow_MMI(const uint8_t* src_rgb565,
+ int src_stride_rgb565,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+  const uint64_t value = 0x2020202020202020;
+  const uint64_t mask_u = 0x0026004a00700002;
+  const uint64_t mask_v = 0x00020070005e0012;
+  const uint64_t mask = 0x93;
+  const uint64_t c0 = 0x001f001f001f001f;
+  const uint64_t c1 = 0x00ff00ff00ff00ff;
+  const uint64_t c2 = 0x0007000700070007;
+ __asm__ volatile(
+ "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest0_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest1_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest2_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest2_v], %[src0], %[c2] \n\t"
+ "psllh %[dest2_v], %[dest2_v], %[three] \n\t"
+ "or %[dest2_v], %[src1], %[dest2_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "psrlh %[r0], %[dest3_u], %[three] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest3_v], %[src0], %[c2] \n\t"
+ "psllh %[dest3_v], %[dest3_v], %[three] \n\t"
+ "or %[dest3_v], %[src1], %[dest3_v] \n\t"
+ "psrlh %[src0], %[src0], %[three] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t"
+ "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_rgb565] "r"(src_rgb565), [next_rgb565] "r"(src_stride_rgb565),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [one] "f"(0x01)
+ : "memory");
+}
+
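+// Convert two rows of ARGB1555 to one row of U and one row of V,
+// averaging 2x2 blocks of pixels from this row and the row below;
+// 16 pixels per loop iteration. c0/c1/c2/c3 reassemble the 5-bit B, G
+// and R fields from the split bytes before the averaged sums go through
+// the same pmaddhw-based U/V dot products as the other ToUV rows.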
+void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555,
+ int src_stride_argb1555,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[11];
+ uint64_t value = 0x2020202020202020;
+ uint64_t mask_u = 0x0026004a00700002;
+ uint64_t mask_v = 0x00020070005e0012;
+ uint64_t mask = 0x93;
+ uint64_t c0 = 0x001f001f001f001f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t c2 = 0x0003000300030003;
+ uint64_t c3 = 0x007c007c007c007c;
+ __asm__ volatile(
+ "daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest0_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest0_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest1_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest1_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest2_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest2_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest0_v], %[src0], %[c2] \n\t"
+ "psllh %[dest0_v], %[dest0_v], %[three] \n\t"
+ "or %[dest0_v], %[src1], %[dest0_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[src0], %[src0], %[five] \n\t"
+ "and %[g0], %[dest3_u], %[c2] \n\t"
+ "psllh %[g0], %[g0], %[three] \n\t"
+ "or %[g0], %[src0], %[g0] \n\t"
+ "and %[r0], %[dest3_u], %[c3] \n\t"
+ "psrlh %[r0], %[r0], %[two] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[src1], %[src1], %[five] \n\t"
+ "and %[dest1_v], %[src0], %[c2] \n\t"
+ "psllh %[dest1_v], %[dest1_v], %[three] \n\t"
+ "or %[dest1_v], %[src1], %[dest1_v] \n\t"
+ "and %[src0], %[src0], %[c3] \n\t"
+ "psrlh %[src0], %[src0], %[two] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[six] \n\t"
+ "psllh %[r0], %[src0], %[one] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[six] \n\t"
+ "psllh %[g0], %[g0], %[one] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[dest0_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t"
+ "packushb %[dest0_v], %[dest1_u], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t"
+ "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10])
+ : [src_argb1555] "r"(src_argb1555),
+ [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3),
+ [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03),
+ [two] "f"(0x02), [one] "f"(0x01)
+ : "memory");
+}
+
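+// Convert two rows of ARGB4444 to one row of U and one row of V,
+// averaging 2x2 blocks of pixels from this row and the row below;
+// 16 pixels per loop iteration. c0 (0x000f) extracts each 4-bit channel
+// nibble and c1 (0x00ff) isolates the low byte before the sums are
+// rescaled and fed to the U/V coefficient multiplies.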
+void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444,
+ int src_stride_argb4444,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[13];
+ uint64_t value = 0x2020202020202020;
+ uint64_t mask_u = 0x0026004a00700002;
+ uint64_t mask_v = 0x00020070005e0012;
+ uint64_t mask = 0x93;
+ uint64_t c0 = 0x000f000f000f000f;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ __asm__ volatile(
+ "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[next_argb4444]) \n\t"
+ "psrlh %[dest0_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest0_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest0_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest0_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest0_u] \n\t"
+ "paddh %[g0], %[g0], %[dest0_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest0_u], %[dest0_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest0_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest0_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest0_u], %[dest0_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest0_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[b0] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[g0] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[next_argb4444]) \n\t"
+ "psrlh %[dest1_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest1_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest1_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest1_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest1_u] \n\t"
+ "paddh %[g0], %[g0], %[dest1_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest1_u], %[dest1_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest1_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest1_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest1_u], %[dest1_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest1_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[b0] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[g0] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x17(%[next_argb4444]) \n\t"
+ "psrlh %[dest2_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest2_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest2_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest2_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest2_u] \n\t"
+ "paddh %[g0], %[g0], %[dest2_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest2_u], %[dest2_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest2_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest2_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest2_u], %[dest2_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest2_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[b0] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[g0] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[next_argb4444]) \n\t"
+ "psrlh %[dest3_u], %[src0], %[eight] \n\t"
+ "and %[b0], %[src0], %[c0] \n\t"
+ "and %[src0], %[src0], %[c1] \n\t"
+ "psrlh %[g0], %[src0], %[four] \n\t"
+ "and %[r0], %[dest3_u], %[c0] \n\t"
+ "psrlh %[src0], %[src1], %[eight] \n\t"
+ "and %[dest3_u], %[src1], %[c0] \n\t"
+ "and %[src1], %[src1], %[c1] \n\t"
+ "psrlh %[dest3_v], %[src1], %[four] \n\t"
+ "and %[src0], %[src0], %[c0] \n\t"
+ "paddh %[b0], %[b0], %[dest3_u] \n\t"
+ "paddh %[g0], %[g0], %[dest3_v] \n\t"
+ "paddh %[r0], %[r0], %[src0] \n\t"
+ "punpcklhw %[src0], %[b0], %[r0] \n\t"
+ "punpckhhw %[src1], %[b0], %[r0] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[src0], %[dest3_u], %[dest3_v] \n\t"
+ "psrlh %[b0], %[src0], %[four] \n\t"
+ "psllh %[r0], %[src0], %[two] \n\t"
+ "or %[b0], %[b0], %[r0] \n\t"
+ "psrlh %[r0], %[g0], %[four] \n\t"
+ "psllh %[g0], %[g0], %[two] \n\t"
+ "or %[g0], %[g0], %[r0] \n\t"
+ "punpcklhw %[src0], %[g0], %[value] \n\t"
+ "punpckhhw %[src1], %[g0], %[value] \n\t"
+ "punpcklwd %[dest3_u], %[src0], %[src1] \n\t"
+ "punpckhwd %[dest3_v], %[src0], %[src1] \n\t"
+ "paddh %[g0], %[dest3_u], %[dest3_v] \n\t"
+ "punpcklhw %[src0], %[b0], %[g0] \n\t"
+ "punpckhhw %[src1], %[b0], %[g0] \n\t"
+
+ "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t"
+ "pshufh %[dest3_u], %[src0], %[mask] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[g0], %[src1], %[mask_v] \n\t"
+ "pshufh %[b0], %[src1], %[mask] \n\t"
+ "pmaddhw %[b0], %[b0], %[mask_u] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[b0] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[b0] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[g0] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[g0] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t"
+ "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x10 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]),
+ [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]),
+ [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]),
+ [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]),
+ [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]),
+ [dest3_v] "=&f"(ftmp[12])
+ : [src_argb4444] "r"(src_argb4444),
+ [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u),
+ [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value),
+ [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u),
+ [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04),
+ [two] "f"(0x02)
+ : "memory");
+}
+
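+// Full-resolution U and V: one U and one V byte per ARGB pixel, with no
+// 2x1 subsampling; 8 pixels per loop iteration. Decoding mask_u/mask_v,
+// each pmaddhw/psubw/psraw sequence reduces to the usual fixed-point
+// expressions, the same values as the scalar ARGBToUV444Row_C:
+//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8
+//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8
+// (0x8080 is the 0x4040 bias times its coefficient of 2 in the masks.)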
+void ARGBToUV444Row_MMI(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t ftmp[12];
+ const uint64_t value = 0x4040;
+ const uint64_t mask_u = 0x0026004a00700002;
+ const uint64_t mask_v = 0x00020070005e0012;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t"
+ "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t"
+ "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t"
+
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t"
+ "psubw %[dest0_u], %[src0], %[src1] \n\t"
+ "psraw %[dest0_u], %[dest0_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t"
+ "psubw %[dest0_v], %[src1], %[src0] \n\t"
+ "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t"
+ "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t"
+ "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t"
+ "psubw %[dest1_u], %[src0], %[src1] \n\t"
+ "psraw %[dest1_u], %[dest1_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t"
+ "psubw %[dest1_v], %[src1], %[src0] \n\t"
+ "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t"
+ "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t"
+ "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t"
+ "psubw %[dest2_u], %[src0], %[src1] \n\t"
+ "psraw %[dest2_u], %[dest2_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t"
+ "psubw %[dest2_v], %[src1], %[src0] \n\t"
+ "psraw %[dest2_v], %[dest2_v], %[eight] \n\t"
+
+ "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t"
+ "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t"
+ "punpcklbh %[src_lo], %[src0], %[zero] \n\t"
+ "punpckhbh %[src_hi], %[src0], %[zero] \n\t"
+ "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t"
+ "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t"
+ "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t"
+ "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t"
+ "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t"
+ "dsll %[src_lo], %[src_hi], %[sixteen] \n\t"
+ "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
+ "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t"
+ "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t"
+
+ "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t"
+ "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t"
+ "psubw %[dest3_u], %[src0], %[src1] \n\t"
+ "psraw %[dest3_u], %[dest3_u], %[eight] \n\t"
+ "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t"
+ "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t"
+ "psubw %[dest3_v], %[src1], %[src0] \n\t"
+ "psraw %[dest3_v], %[dest3_v], %[eight] \n\t"
+
+ "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t"
+ "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t"
+ "packushb %[dest0_u], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t"
+ "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t"
+
+ "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t"
+ "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t"
+ "packushb %[dest0_v], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t"
+ "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
+
+ "daddiu %[src_argb], %[src_argb], 0x20 \n\t"
+ "daddiu %[dst_u], %[dst_u], 0x08 \n\t"
+ "daddiu %[dst_v], %[dst_v], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bgtz %[width], 1b \n\t"
+ : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),
+ [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),
+ [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),
+ [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),
+ [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),
+ [dest3_v] "=&f"(ftmp[11])
+ : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v),
+ [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10),
+ [eight] "f"(0x08)
+ : "memory");
+}
+
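+// Gray a row of ARGB, 2 pixels per loop iteration. mask2 packs the
+// pmaddhw weights 15/75/38 for B/G/R plus a rounding term of 64, so the
+// kernel computes gray = (15 * b + 75 * g + 38 * r + 64) >> 7, the same
+// weights as the scalar ARGBGrayRow_C; the alpha bytes saved in src37
+// pass through unchanged.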
+void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x01;
+ const uint64_t mask2 = 0x00400026004B000FULL;
+ const uint64_t mask3 = 0xFF000000FF000000ULL;
+ const uint64_t mask4 = ~mask3;
+ const uint64_t shift = 0x07;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "and %[src37], %[src], %[mask3] \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_lo], %[src_lo], %[mask1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_lo] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_lo] \n\t"
+ "paddw %[dest_lo], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest_lo], %[dest_lo] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+ "pinsrh_3 %[src_hi], %[src_hi], %[mask1] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[mask2] \n\t"
+ "punpcklwd %[tmp0], %[dest_hi], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_hi], %[dest_hi] \n\t"
+ "paddw %[dest_hi], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest_hi], %[dest_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask4] \n\t"
+ "or %[dest], %[dest], %[src37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [tmp0] "=&f"(tmp0),
+ [tmp1] "=&f"(tmp1), [src] "=&f"(src), [dest] "=&f"(dest),
+ [src37] "=&f"(src37)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
+ : "memory");
+}
+
+// Convert a row of ARGB pixels to sepia tone.
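+// Processes 2 pixels per loop iteration. The three pmaddhw masks decode
+// to the standard sepia weights (the same as ARGBSepiaRow_C):
+//   b = (17 * b + 68 * g + 35 * r) >> 7
+//   g = (22 * b + 88 * g + 45 * r) >> 7
+//   r = (24 * b + 98 * g + 50 * r) >> 7
+// The original alpha bytes, saved in dest37, are OR-ed back into the
+// result.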
+void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) {
+ uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2;
+ uint64_t tmp0, tmp1;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x002300440011ULL;
+ const uint64_t mask2 = 0x002D00580016ULL;
+ const uint64_t mask3 = 0x003200620018ULL;
+ const uint64_t mask4 = 0xFF000000FF000000ULL;
+ const uint64_t shift = 0x07;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[dest37], %[dest], %[mask4] \n\t"
+
+ "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t"
+ "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t"
+ "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t"
+ "pmaddhw %[dest2], %[dest_hi], %[mask3] \n\t"
+ "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t"
+ "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t"
+ "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "or %[dest], %[dest], %[dest37] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1),
+ [dest] "=&f"(dest)
+ : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [shift] "f"(shift)
+ : "memory");
+}
+
+// Apply a signed color matrix to a row of ARGB pixels.
+// TODO(fbarchard): Consider adding rounding (+32).
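+// Each output channel is the dot product of an unpacked BGRA pixel with
+// one row of the matrix, arithmetic-shifted right by 6. The psllh/psrah
+// pairs sign-extend the int8 matrix coefficients to 16 bits before the
+// multiplies; 2 pixels per loop iteration.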
+void ARGBColorMatrixRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2,
+ dest3;
+ uint64_t matrix, matrix_hi, matrix_lo;
+ uint64_t tmp0, tmp1;
+ const uint64_t shift0 = 0x06;
+ const uint64_t shift1 = 0x08;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest0], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest1], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest1], %[dest1], %[shift0] \n\t"
+
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest2], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest2], %[dest2], %[shift0] \n\t"
+
+ "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t"
+ "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t"
+ "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t"
+ "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t"
+ "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t"
+ "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t"
+ "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t"
+ "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t"
+ "paddw %[dest3], %[tmp0], %[tmp1] \n\t"
+ "psraw %[dest3], %[dest3], %[shift0] \n\t"
+
+ "packsswh %[tmp0], %[dest0], %[dest1] \n\t"
+ "packsswh %[tmp1], %[dest2], %[dest3] \n\t"
+ "packushb %[dest], %[tmp0], %[tmp1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi),
+ [matrix_lo] "=&f"(matrix_lo), [matrix] "=&f"(matrix)
+ : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1)
+ : "memory");
+}
+
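+// Scale each channel by the matching byte of 'value'. Both operands are
+// widened by interleaving with themselves (x -> x * 0x101), pmulhuh keeps
+// the high 16 bits of the product and the final psrlh leaves roughly
+// (channel * scale) >> 8; 2 pixels per loop iteration.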
+void ARGBShadeRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width,
+ uint32_t value) {
+ uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "punpcklbh %[value], %[value], %[value] \n\t"
+
+ "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src),
+ [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width),
+ [value] "f"(value), [shift] "f"(shift)
+ : "memory");
+}
+
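+// Multiply blend: src0 is widened as x * 0x101 and src1 zero-extended, so
+// pmulhuh yields (src0 * src1 * 257) >> 16, approximately
+// src0 * src1 / 255 per channel; 2 pixels per loop iteration.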
+void ARGBMultiplyRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo;
+ uint64_t dest, dest_lo, dest_hi;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[src0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[src0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask] \n\t"
+
+ "pmulhuh %[dest_lo], %[src0_lo], %[src1_lo] \n\t"
+ "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
+
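+// Saturating per-byte add of two ARGB rows (paddusb clamps at 255);
+// 2 pixels per loop iteration.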
+void ARGBAddRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "paddusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
+
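+// Saturating per-byte subtract of two ARGB rows (psubusb clamps at 0);
+// 2 pixels per loop iteration.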
+void ARGBSubtractRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "psubusb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [width] "r"(width)
+ : "memory");
+}
+
+// Sobel functions that mimic the SSSE3 versions.
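+// SobelXRow computes the horizontal gradient over three source rows,
+// 8 pixels per loop iteration. A scalar sketch of the kernel (the same
+// arithmetic as SobelXRow_C):
+//   int a = src_y0[i] + 2 * src_y1[i] + src_y2[i];
+//   int b = src_y0[i + 2] + 2 * src_y1[i + 2] + src_y2[i + 2];
+//   dst_sobelx[i] = clamp255(abs(a - b));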
+void SobelXRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int width) {
+ uint64_t y00 = 0, y10 = 0, y20 = 0;
+ uint64_t y02 = 0, y12 = 0, y22 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i]
+ "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2]
+ "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t" // a+b
+ "paddh %[y20], %[y20], %[y10] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c
+
+ "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub
+ "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub
+ "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[sobel], %[y10], %[y20] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t"
+ "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t"
+ "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t"
+ "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y20], %[y20], %[zero] \n\t"
+
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+ "punpcklbh %[y22], %[y22], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y10] \n\t"
+ "paddh %[y20], %[y20], %[y10] \n\t"
+ "paddh %[y00], %[y00], %[y20] \n\t"
+
+ "paddh %[y02], %[y02], %[y12] \n\t"
+ "paddh %[y22], %[y22], %[y12] \n\t"
+ "paddh %[y02], %[y02], %[y22] \n\t"
+
+ "pmaxsh %[y10], %[y00], %[y02] \n\t"
+ "pminsh %[y20], %[y00], %[y02] \n\t"
+ "psubh %[y00], %[y10], %[y20] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[src_y2], %[src_y2], 8 \n\t"
+ "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10),
+ [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2),
+ [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
+
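+// SobelYRow computes the vertical gradient from two source rows,
+// 8 pixels per loop iteration; the scalar equivalent (as in SobelYRow_C):
+//   int a = src_y0[i] + 2 * src_y0[i + 1] + src_y0[i + 2];
+//   int b = src_y1[i] + 2 * src_y1[i + 1] + src_y1[i + 2];
+//   dst_sobely[i] = clamp255(abs(a - b));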
+void SobelYRow_MMI(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int width) {
+ uint64_t y00 = 0, y01 = 0, y02 = 0;
+ uint64_t y10 = 0, y11 = 0, y12 = 0;
+ uint64_t zero = 0x0;
+ uint64_t sobel = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i]
+ "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x08(%[src_y0]) \n\t" // b=src_y0[i+1]
+ "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2]
+ "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i]
+ "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1]
+ "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2]
+ "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t" // a+b
+ "paddh %[y02], %[y02], %[y01] \n\t" // c+b
+ "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c
+
+ "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub
+ "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub
+ "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[sobel], %[y02], %[y12] \n\t" // Abs
+
+ "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t"
+ "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t"
+ "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t"
+ "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t"
+ "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t"
+ "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t"
+
+ "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t"
+ "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t"
+ "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t"
+ "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t"
+ "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t"
+ "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t"
+
+ "punpcklbh %[y00], %[y00], %[zero] \n\t"
+ "punpcklbh %[y01], %[y01], %[zero] \n\t"
+ "punpcklbh %[y02], %[y02], %[zero] \n\t"
+
+ "punpcklbh %[y10], %[y10], %[zero] \n\t"
+ "punpcklbh %[y11], %[y11], %[zero] \n\t"
+ "punpcklbh %[y12], %[y12], %[zero] \n\t"
+
+ "paddh %[y00], %[y00], %[y01] \n\t"
+ "paddh %[y02], %[y02], %[y01] \n\t"
+ "paddh %[y00], %[y00], %[y02] \n\t"
+
+ "paddh %[y10], %[y10], %[y11] \n\t"
+ "paddh %[y12], %[y12], %[y11] \n\t"
+ "paddh %[y10], %[y10], %[y12] \n\t"
+
+ "pmaxsh %[y02], %[y00], %[y10] \n\t"
+ "pminsh %[y12], %[y00], %[y10] \n\t"
+ "psubh %[y00], %[y02], %[y12] \n\t"
+
+ "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255
+ "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t"
+ "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t"
+
+ "daddiu %[src_y0], %[src_y0], 8 \n\t"
+ "daddiu %[src_y1], %[src_y1], 8 \n\t"
+ "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01),
+ [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12)
+ : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1),
+ [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero)
+ : "memory");
+}
+
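+// Combine the Sobel X and Y planes into ARGB: s = clamp255(x + y) becomes
+// an opaque gray pixel (s, s, s, 255), so 8 source bytes expand to 32
+// bytes of ARGB per loop iteration.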
+void SobelRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+  uint64_t temp[3];
+ uint64_t c1 = 0xff000000ff000000;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i]
+ "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t"
+ // s7 s6 s5 s4 s3 s2 s1 s0 = a+b
+      "paddusb %[t2], %[t0], %[t1]                \n\t"
+
+ // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0
+ "punpcklbh %[t0], %[t2], %[t2] \n\t"
+
+      // s1 s1 s0 s0->s1 s1 s1 s1 s0 s0 s0 s0
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+      // 255 s1 s1 s1 255 s0 s0 s0
+ "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t"
+
+ // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ // 255 s3 s3 s3 255 s2 s2 s2
+ "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t"
+
+ // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4
+ "punpckhbh %[t0], %[t2], %[t2] \n\t"
+
+ // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4
+ "punpcklbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t"
+
+ // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6
+ "punpckhbh %[t1], %[t0], %[t0] \n\t"
+ "or %[t1], %[t1], %[c1] \n\t"
+ "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
+
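+// Combine the Sobel X and Y planes into a single gray plane:
+// dst_y[i] = clamp255(sobelx[i] + sobely[i]); 8 pixels per loop iteration.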
+void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
+ int width) {
+ uint64_t tr = 0;
+ uint64_t tb = 0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t"
+ "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "paddusb %[tr], %[tr], %[tb] \n\t" // g
+ "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t"
+
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(tr), [tb] "=&f"(tb)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_y] "r"(dst_y), [width] "r"(width)
+ : "memory");
+}
+
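+// Pack the Sobel components into ARGB: B = sobely, G = clamp255(sobelx +
+// sobely), R = sobelx, A = 255; 8 pixels per loop iteration.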
+void SobelXYRow_MMI(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t temp[3];
+ uint64_t result = 0;
+ uint64_t gb = 0;
+ uint64_t cr = 0;
+ uint64_t c1 = 0xffffffffffffffff;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i]
+ "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t"
+ "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i]
+ "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t"
+    "paddusb    %[tg],   %[tr],    %[tb]          \n\t"  // g = saturate(r + b)
+
+ // g3 b3 g2 b2 g1 b1 g0 b0
+ "punpcklbh %[gb], %[tb], %[tg] \n\t"
+    // c3 r3 c2 r2 c1 r1 c0 r0
+ "punpcklbh %[cr], %[tr], %[c1] \n\t"
+ // c1 r1 g1 b1 c0 r0 g0 b0
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t"
+ // c3 r3 g3 b3 c2 r2 g2 b2
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t"
+
+ // g7 b7 g6 b6 g5 b5 g4 b4
+ "punpckhbh %[gb], %[tb], %[tg] \n\t"
+ // c7 r7 c6 r6 c5 r5 c4 r4
+ "punpckhbh %[cr], %[tr], %[c1] \n\t"
+ // c5 r5 g5 b5 c4 r4 g4 b4
+ "punpcklhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t"
+ // c7 r7 g7 b7 c6 r6 g6 b6
+ "punpckhhw %[result], %[gb], %[cr] \n\t"
+ "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t"
+ "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t"
+
+ "daddiu %[dst_argb], %[dst_argb], 32 \n\t"
+ "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t"
+ "daddiu %[src_sobely], %[src_sobely], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]),
+ [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result)
+ : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely),
+ [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1)
+ : "memory");
+}
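+
+// Per pixel, the interleaves above pack (a sketch of the intent, matching
+// SobelXYRow_C in row_common.cc): B = Sobel Y, G = saturated Sobel X + Y,
+// R = Sobel X, A = 255.
+//
+//   for (int i = 0; i < width; ++i) {
+//     int g = src_sobelx[i] + src_sobely[i];
+//     dst_argb[4 * i + 0] = src_sobely[i];               // B
+//     dst_argb[4 * i + 1] = g > 255 ? 255 : (uint8_t)g;  // G
+//     dst_argb[4 * i + 2] = src_sobelx[i];               // R
+//     dst_argb[4 * i + 3] = 255;                         // A
+//   }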
+
+void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ // Copy a Y to RGB.
+ uint64_t src, dest;
+ const uint64_t mask0 = 0x00ffffff00ffffffULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src], %[src], %[src] \n\t"
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "and %[dest], %[dest], %[mask0] \n\t"
+ "or %[dest], %[dest], %[mask1] \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
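+
+// Equivalent scalar form (illustrative only): each Y byte becomes one gray
+// ARGB pixel, {y, y, y, 255} in B, G, R, A order.
+//
+//   for (int i = 0; i < width; ++i) {
+//     dst_argb[4 * i + 0] = src_y[i];
+//     dst_argb[4 * i + 1] = src_y[i];
+//     dst_argb[4 * i + 2] = src_y[i];
+//     dst_argb[4 * i + 3] = 255;
+//   }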
+
+void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
+ uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x55;
+ const uint64_t mask2 = 0xAA;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = 0x4A354A354A354A35ULL;
+ const uint64_t mask5 = 0x0488048804880488ULL;
+ const uint64_t shift0 = 0x08;
+ const uint64_t shift1 = 0x06;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_lo], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_lo], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask0] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask1] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+
+ "pshufh %[src], %[src_hi], %[mask2] \n\t"
+ "psllh %[dest_lo], %[src], %[shift0] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src] \n\t"
+ "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t"
+ "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t"
+ "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t"
+ "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t"
+ "pshufh %[src], %[src_hi], %[mask3] \n\t"
+ "psllh %[dest_hi], %[src], %[shift0] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src] \n\t"
+ "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t"
+ "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t"
+ "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t"
+ "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t"
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo)
+ : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3),
+ [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0),
+ [shift1] "f"(shift1), [width] "r"(width)
+ : "memory");
+}
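+
+// A hedged reading of the constants above: each pshufh/psllh/pmulhuh chain
+// expands one Y sample through the BT.601 luma transfer in fixed point,
+//
+//   t = y * 257;              // psllh 8 + paddush: y * 256 + y
+//   t = (t * 0x4A35) >> 16;   // pmulhuh; 257 * 0x4A35 / 65536 ~= 74.50
+//   t = (t - 0x0488) >> 6;    // psubh + psrah
+//
+// which is roughly 1.164 * y - 18.1, mapping studio-range Y (16..235) onto
+// (0..255) before packushb clamps and pinsrh_3 inserts the 255 alpha.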
+
+void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, src0, src1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x1b;
+
+ src += width - 1;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[source], 0(%[src_ptr]) \n\t"
+ "gsldrc1 %[source], -7(%[src_ptr]) \n\t"
+ "punpcklbh %[src0], %[source], %[mask0] \n\t"
+ "pshufh %[src0], %[src0], %[mask1] \n\t"
+ "punpckhbh %[src1], %[source], %[mask0] \n\t"
+ "pshufh %[src1], %[src1], %[mask1] \n\t"
+ "packushb %[dest], %[src1], %[src0] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [source] "=&f"(source), [dest] "=&f"(dest), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void MirrorUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t src0, src1, dest0, dest1;
+ const uint64_t mask0 = 0x00ff00ff00ff00ffULL;
+ const uint64_t mask1 = 0x1b;
+ const uint64_t shift = 0x08;
+
+ src_uv += (width - 1) << 1;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 1(%[src_ptr]) \n\t"
+ "gsldrc1 %[src0], -6(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], -7(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], -14(%[src_ptr]) \n\t"
+
+ "and %[dest0], %[src0], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "and %[dest1], %[src1], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t"
+
+ "psrlh %[dest0], %[src0], %[shift] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "psrlh %[dest1], %[src1], %[shift] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t"
+
+ "daddi %[src_ptr], %[src_ptr], -0x10 \n\t"
+ "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t"
+ "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1)
+ : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v),
+ [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ src += (width - 1) * 4;
+ uint64_t temp = 0x0;
+ uint64_t shuff = 0x4e; // 01 00 11 10
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[temp], 3(%[src]) \n\t"
+ "gsldrc1 %[temp], -4(%[src]) \n\t"
+ "pshufh %[temp], %[temp], %[shuff] \n\t"
+ "gssdrc1 %[temp], 0x0(%[dst]) \n\t"
+ "gssdlc1 %[temp], 0x7(%[dst]) \n\t"
+
+ "daddiu %[src], %[src], -0x08 \n\t"
+ "daddiu %[dst], %[dst], 0x08 \n\t"
+ "daddiu %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [temp] "=&f"(temp)
+ : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff)
+ : "memory");
+}
+
+void SplitUVRow_MMI(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t"
+
+ "and %[t2], %[t0], %[c0] \n\t"
+ "and %[t3], %[t1], %[c0] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t"
+
+ "psrlh %[t2], %[t0], %[shift] \n\t"
+ "psrlh %[t3], %[t1], %[shift] \n\t"
+ "packushb %[t2], %[t2], %[t3] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t"
+
+ "daddiu %[src_uv], %[src_uv], 16 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [t3] "=&f"(temp[3])
+ : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
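+
+// The and/psrlh pair above deinterleaves NV-style UV bytes; the scalar
+// equivalent (a sketch; SplitUVRow_C is the canonical version) is:
+//
+//   for (int i = 0; i < width; ++i) {
+//     dst_u[i] = src_uv[2 * i];      // even bytes, kept by the c0 mask
+//     dst_v[i] = src_uv[2 * i + 1];  // odd bytes, exposed by psrlh 8
+//   }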
+
+void MergeUVRow_MMI(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
+ int width) {
+ uint64_t temp[3];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_u]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_u]) \n\t"
+ "gsldrc1 %[t1], 0x0(%[src_v]) \n\t"
+ "gsldlc1 %[t1], 0x7(%[src_v]) \n\t"
+ "punpcklbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t"
+ "punpckhbh %[t2], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t"
+ "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t"
+
+ "daddiu %[src_u], %[src_u], 8 \n\t"
+ "daddiu %[src_v], %[src_v], 8 \n\t"
+ "daddiu %[dst_uv], %[dst_uv], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2])
+ : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [width] "r"(width)
+ : "memory");
+}
+
+void SplitRGBRow_MMI(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ uint64_t src[4];
+ uint64_t dest_hi, dest_lo, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t"
+ "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t"
+ "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[src1] \n\t"
+ "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t"
+ "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t"
+ "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t"
+ "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src2], %[src3] \n\t"
+
+ "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t"
+ "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t"
+ "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t"
+ "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t"
+ "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]),
+ [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g),
+ [dstb_ptr] "r"(dst_b), [width] "r"(width)
+ : "memory");
+}
+
+void MergeRGBRow_MMI(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ uint64_t srcr, srcg, srcb, dest;
+ uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo;
+ const uint64_t temp = 0x0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t"
+ "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t"
+ "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t"
+ "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t"
+ "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t"
+ "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t"
+
+ "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t"
+ "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t"
+ "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t"
+ "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t"
+
+ "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t"
+ "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t"
+ "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t"
+ "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest], %[dest], %[dest] \n\t"
+ "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t"
+
+ "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t"
+ "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t"
+ "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb),
+ [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi),
+ [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi),
+ [srcbz_lo] "=&f"(srcbz_lo)
+ : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b),
+ [dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp)
+ : "memory");
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_MMI(const uint8_t* src_yuy2,
+ int src_stride_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
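+
+// Per output chroma sample the pavgb/mask pipeline computes (scalar sketch;
+// row1 is the line src_stride_yuy2 bytes below):
+//
+//   const uint8_t* row1 = src_yuy2 + src_stride_yuy2;
+//   for (int i = 0; i < width / 2; ++i) {
+//     dst_u[i] = (src_yuy2[4 * i + 1] + row1[4 * i + 1] + 1) >> 1;
+//     dst_v[i] = (src_yuy2[4 * i + 3] + row1[4 * i + 3] + 1) >> 1;
+//   }
+//
+// pavgb rounds up, hence the +1 before the shift.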
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ uint64_t c0 = 0xff00ff00ff00ff00;
+ uint64_t c1 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c1] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "psrlh %[t0], %[t0], %[shift] \n\t"
+ "psrlh %[t1], %[t1], %[shift] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c1] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0)
+ : "memory");
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
+ int src_stride_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[3];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ uint64_t src_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t"
+ "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t"
+ "pavgb %[t0], %[t0], %[t1] \n\t"
+
+ "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t"
+ "pavgb %[t1], %[t2], %[t1] \n\t"
+
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]),
+ [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]),
+ [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride)
+ : [src_uyvy] "r"(src_uyvy), [src_stride_uyvy] "r"(src_stride_uyvy),
+ [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ // Output a row of UV values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t temp[2];
+ uint64_t data[4];
+ uint64_t shift = 0x08;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d0], %[t0], %[c0] \n\t"
+ "psrlh %[d1], %[t1], %[shift] \n\t"
+
+ "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+ "and %[t1], %[t1], %[c0] \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "mov.s %[t1], %[t0] \n\t"
+ "and %[d2], %[t0], %[c0] \n\t"
+ "psrlh %[d3], %[t1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d2] \n\t"
+ "packushb %[d1], %[d1], %[d3] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t"
+ "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t"
+ "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t"
+ "daddiu %[dst_u], %[dst_u], 8 \n\t"
+ "daddiu %[dst_v], %[dst_v], 8 \n\t"
+ "daddiu %[width], %[width], -16 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]),
+ [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v),
+ [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ // Output a row of Y values.
+ uint64_t c0 = 0x00ff00ff00ff00ff;
+ uint64_t shift = 0x08;
+ uint64_t temp[2];
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
+ "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
+ "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
+ "dsrl %[t0], %[t0], %[shift] \n\t"
+ "dsrl %[t1], %[t1], %[shift] \n\t"
+ "and %[t0], %[t0], %[c0] \n\t"
+      "and        %[t1],     %[t1],          %[c0]    \n\t"
+ "packushb %[t0], %[t0], %[t1] \n\t"
+ "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
+ "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
+ "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
+ "daddiu %[dst_y], %[dst_y], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1])
+ : [src_uyvy] "r"(src_uyvy), [dst_y] "r"(dst_y), [width] "r"(width),
+ [c0] "f"(c0), [shift] "f"(shift)
+ : "memory");
+}
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_MMI(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi,
+ dest_lo;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t mask3 = 0xFF;
+ const uint64_t mask4 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_lo] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t"
+
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "psubush %[alpha], %[mask2], %[src0_hi] \n\t"
+ "pshufh %[alpha], %[alpha], %[mask3] \n\t"
+ "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[mask4] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo)
+ : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1),
+ [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
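+
+// The arithmetic above is the usual "over" approximation: for each color
+// channel c of a pixel whose source alpha is a,
+//
+//   dst_c = min(255, src0_c + ((src1_c * (255 - a)) >> 8));  // paddush
+//   dst_a = 255;                                             // mask1/mask4
+//
+// using >> 8 rather than / 255, like the SSSE3 row it mimics.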
+
+void BlendPlaneRow_MMI(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
+ int width) {
+ uint64_t source0, source1, dest, alph;
+ uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi,
+ dest_lo;
+ uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL;
+ const uint64_t mask2 = 0x00FF00FF00FF00FFULL;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t"
+ "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t"
+ "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t"
+
+ "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t"
+ "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t"
+
+ "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t"
+ "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t"
+ "psubusb %[alpha_r], %[mask1], %[alpha] \n\t"
+ "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t"
+ "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t"
+ "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t"
+ "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t"
+
+ "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t"
+ "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[dest] \n\t"
+ "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+
+ "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t"
+ "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[dest] \n\t"
+ "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t"
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[alpha_ptr], %[alpha_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph),
+ [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo),
+ [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo),
+ [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi),
+ [alpha_r] "=&f"(alpha_rev)
+ : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha),
+ [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha;
+ const uint64_t mask0 = 0xFF;
+ const uint64_t mask1 = 0xFF000000FF000000ULL;
+ const uint64_t mask2 = ~mask1;
+ const uint64_t shift = 0x08;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[src] \n\t"
+ "punpckhbh %[src_hi], %[src], %[src] \n\t"
+
+ "pshufh %[alpha], %[src_lo], %[mask0] \n\t"
+ "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t"
+ "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+ "pshufh %[alpha], %[src_hi], %[mask0] \n\t"
+ "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t"
+ "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "and %[dest], %[dest], %[mask2] \n\t"
+ "and %[src], %[src], %[mask1] \n\t"
+ "or %[dest], %[dest], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift),
+ [width] "r"(width)
+ : "memory");
+}
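+
+// Per channel this approximates dst_c = c * a / 255 (the alpha byte itself
+// is carried over via mask1). punpcklbh of a register with itself turns a
+// byte c into c * 257 in a 16-bit lane, so the pmulhuh/psrlh pair yields
+//
+//   ((c * 257) * (a * 257)) >> 16 >> 8  ==  (c * a * 66049) >> 24
+//
+// and 66049 / 2^24 is about 1 / 254, a close stand-in for / 255.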
+
+void ComputeCumulativeSumRow_MMI(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
+ int width) {
+ int64_t row_sum[2] = {0, 0};
+ uint64_t src, dest0, dest1, presrc0, presrc1, dest;
+ const uint64_t mask = 0x0;
+
+ __asm__ volatile(
+ "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t"
+ "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t"
+
+ "1: \n\t"
+ "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t"
+ "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t"
+
+ "punpcklbh %[src], %[src], %[mask] \n\t"
+ "punpcklhw %[dest0], %[src], %[mask] \n\t"
+ "punpckhhw %[dest1], %[src], %[mask] \n\t"
+
+ "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t"
+ "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t"
+
+ "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t"
+ "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t"
+ "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t"
+
+ "paddw %[dest0], %[row_sum0], %[presrc0] \n\t"
+ "paddw %[dest1], %[row_sum1], %[presrc1] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t"
+ "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]),
+ [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0),
+ [presrc1] "=&f"(presrc1)
+ : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum),
+ [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask)
+ : "memory");
+}
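+
+// Scalar equivalent (sketch): one ARGB pixel per iteration, keeping a
+// running per-channel sum across the row and adding the row above, i.e.
+// the standard integral-image recurrence.
+//
+//   int32_t sum[4] = {0, 0, 0, 0};
+//   for (int x = 0; x < width; ++x) {
+//     for (int c = 0; c < 4; ++c) {
+//       sum[c] += row[4 * x + c];
+//       cumsum[4 * x + c] = sum[c] + previous_cumsum[4 * x + c];
+//     }
+//   }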
+
+// Bilinear filter 2 rows into 1 (2x2 -> 2x1).
+void InterpolateRow_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ int width,
+ int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ __asm__ volatile(
+ "1: \n\t"
+ "ld $t0, 0x0(%[src_ptr]) \n\t"
+ "sd $t0, 0x0(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ :
+ : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width)
+ : "memory");
+ return;
+ }
+ if (source_y_fraction == 128) {
+ uint64_t uv = 0x0;
+ uint64_t uv_stride = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t"
+ "daddu $t0, %[src_ptr], %[stride] \n\t"
+ "gsldrc1 %[uv_stride], 0x0($t0) \n\t"
+ "gsldlc1 %[uv_stride], 0x7($t0) \n\t"
+
+ "pavgb %[uv], %[uv], %[uv_stride] \n\t"
+ "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [stride] "r"((int64_t)src_stride)
+ : "memory");
+ return;
+ }
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ uint64_t temp;
+ uint64_t data[4];
+ uint64_t zero = 0x0;
+ uint64_t c0 = 0x0080008000800080;
+ uint64_t fy0 = 0x0100010001000100;
+ uint64_t shift = 0x8;
+ __asm__ volatile(
+ "pshufh %[fy1], %[fy1], %[zero] \n\t"
+ "psubh %[fy0], %[fy0], %[fy1] \n\t"
+ "1: \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t"
+ "punpcklbh %[d0], %[t0], %[zero] \n\t"
+ "punpckhbh %[d1], %[t0], %[zero] \n\t"
+ "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t"
+ "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t"
+ "punpcklbh %[d2], %[t0], %[zero] \n\t"
+ "punpckhbh %[d3], %[t0], %[zero] \n\t"
+
+ "pmullh %[d0], %[d0], %[fy0] \n\t"
+ "pmullh %[d2], %[d2], %[fy1] \n\t"
+ "paddh %[d0], %[d0], %[d2] \n\t"
+ "paddh %[d0], %[d0], %[c0] \n\t"
+ "psrlh %[d0], %[d0], %[shift] \n\t"
+
+ "pmullh %[d1], %[d1], %[fy0] \n\t"
+ "pmullh %[d3], %[d3], %[fy1] \n\t"
+ "paddh %[d1], %[d1], %[d3] \n\t"
+ "paddh %[d1], %[d1], %[c0] \n\t"
+ "psrlh %[d1], %[d1], %[shift] \n\t"
+
+ "packushb %[d0], %[d0], %[d1] \n\t"
+ "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t"
+ "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t"
+ "daddiu %[src_ptr], %[src_ptr], 8 \n\t"
+ "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+ : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]),
+ [d2] "=&f"(data[2]), [d3] "=&f"(data[3])
+ : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1),
+ [dst_ptr] "r"(dst_ptr), [width] "r"(width),
+ [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0),
+ [shift] "f"(shift), [zero] "f"(zero)
+ : "memory");
+}
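+
+// All three branches compute the same blend, with f = source_y_fraction:
+//
+//   dst_ptr[i] = (src_ptr[i] * (256 - f) + src_ptr[i + src_stride] * f +
+//                 128) >> 8;
+//
+// f == 0 degenerates to a copy, f == 128 to pavgb; the general case runs
+// the pmullh/paddh/psrlh sequence with fy0 = 256 - f and the 0x0080
+// rounding constant in c0.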
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
+ int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) |
+ ((shuffler[2] & 0x03) << 4) |
+ ((shuffler[3] & 0x03) << 6);
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[src], %[mask0] \n\t"
+ "pshufh %[dest0], %[dest0], %[mask1] \n\t"
+ "punpckhbh %[dest1], %[src], %[mask0] \n\t"
+ "pshufh %[dest1], %[dest1], %[mask1] \n\t"
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
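+
+// The pshufh control packs one 2-bit source-lane index per destination
+// halfword, so only the low two bits of each shuffler byte matter and the
+// row can only permute channels within a pixel. For example (hypothetical
+// input), the ARGB -> ABGR shuffler {2, 1, 0, 3} encodes as
+//
+//   mask1 = 2 | (1 << 2) | (0 << 4) | (3 << 6) = 0xC6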
+
+void I422ToYUY2Row_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+      "gsldlc1    %[ty], 0x7(%[src_y])         \n\t"  // y = src_y[i]
+      "gsldrc1    %[ty], 0x0(%[src_y])         \n\t"
+      "gslwlc1    %[tu], 0x3(%[src_u])         \n\t"  // u = src_u[i]
+      "gslwrc1    %[tu], 0x0(%[src_u])         \n\t"
+      "gslwlc1    %[tv], 0x3(%[src_v])         \n\t"  // v = src_v[i]
+      "gslwrc1    %[tv], 0x0(%[src_v])         \n\t"
+      "punpcklbh  %[vu], %[tu], %[tv]          \n\t"  // v3 u3 .. v0 u0
+      "punpcklbh  %[tu], %[ty], %[vu]          \n\t"  // v1 y3 u1 y2 v0 y1 u0 y0
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+      "punpckhbh  %[tu], %[ty], %[vu]          \n\t"  // v3 y7 u3 y6 v2 y5 u2 y4
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+      : [ty] "=&f"(temp[0]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[2]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
+
+void I422ToUYVYRow_MMI(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
+ int width) {
+ uint64_t temp[3];
+ uint64_t vu = 0x0;
+ __asm__ volatile(
+ "1: \n\t"
+      "gsldlc1    %[ty], 0x7(%[src_y])         \n\t"  // y = src_y[i]
+      "gsldrc1    %[ty], 0x0(%[src_y])         \n\t"
+      "gslwlc1    %[tu], 0x3(%[src_u])         \n\t"  // u = src_u[i]
+      "gslwrc1    %[tu], 0x0(%[src_u])         \n\t"
+      "gslwlc1    %[tv], 0x3(%[src_v])         \n\t"  // v = src_v[i]
+      "gslwrc1    %[tv], 0x0(%[src_v])         \n\t"
+      "punpcklbh  %[vu], %[tu], %[tv]          \n\t"  // v3 u3 .. v0 u0
+      "punpcklbh  %[tu], %[vu], %[ty]          \n\t"  // y3 v1 y2 u1 y1 v0 y0 u0
+ "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t"
+      "punpckhbh  %[tu], %[vu], %[ty]          \n\t"  // y7 v3 y6 u3 y5 v2 y4 u2
+ "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t"
+ "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t"
+ "daddiu %[src_y], %[src_y], 8 \n\t"
+ "daddiu %[src_u], %[src_u], 4 \n\t"
+ "daddiu %[src_v], %[src_v], 4 \n\t"
+ "daddiu %[dst_frame], %[dst_frame], 16 \n\t"
+ "daddiu %[width], %[width], -8 \n\t"
+ "bgtz %[width], 1b \n\t"
+ "nop \n\t"
+      : [ty] "=&f"(temp[0]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[2]),
+ [vu] "=&f"(vu)
+ : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
+ [dst_frame] "r"(dst_frame), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest;
+ const uint64_t mask0 = 0xff000000ff000000ULL;
+ const uint64_t mask1 = ~mask0;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "and %[src], %[src], %[mask0] \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[src], %[dest] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ uint64_t src, dest0, dest1, dest_lo, dest_hi, dest;
+ const uint64_t mask = 0xff000000ff000000ULL;
+ const uint64_t shift = 0x18;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+
+ "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t"
+ "and %[dest0], %[src], %[mask] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift] \n\t"
+ "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t"
+ "and %[dest1], %[src], %[mask] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift] \n\t"
+ "packsswh %[dest_hi], %[dest0], %[dest1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(width)
+ : "memory");
+}
+
+void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) {
+ uint64_t source, dest0, dest1, dest;
+ const uint64_t mask0 = 0x0;
+ const uint64_t mask1 = 0x00ffffff00ffffffULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "punpckhbh %[dest0], %[mask0], %[src] \n\t"
+ "punpcklhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t"
+ "punpckhhw %[dest1], %[mask0], %[dest0] \n\t"
+ "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+ "and %[dest], %[dest], %[mask1] \n\t"
+ "or %[dest], %[dest], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0),
+ [mask1] "f"(mask1), [width] "r"(width)
+ : "memory");
+}
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/row_msa.cc b/files/source/row_msa.cc
index f79de1c..5c0239a 100644
--- a/files/source/row_msa.cc
+++ b/files/source/row_msa.cc
@@ -37,17 +37,17 @@
}
// Load YUV 422 pixel data
-#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
- { \
- uint64 y_m; \
- uint32 u_m, v_m; \
- v4i32 zero_m = {0}; \
- y_m = LD(psrc_y); \
- u_m = LW(psrc_u); \
- v_m = LW(psrc_v); \
- out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
- out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
- out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+ { \
+ uint64_t y_m; \
+ uint32_t u_m, v_m; \
+ v4i32 zero_m = {0}; \
+ y_m = LD(psrc_y); \
+ u_m = LW(psrc_u); \
+ v_m = LW(psrc_v); \
+ out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
+ out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \
+ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \
}
// Clip input vector elements between 0 to 255
@@ -163,14 +163,14 @@
v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
v8u16 reg8_m, reg9_m; \
\
- src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \
- src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \
- src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \
- src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \
- src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \
- src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \
- src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \
- src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \
+ src0_m = (v16u8)__msa_ld_b((void*)s, 0); \
+ src1_m = (v16u8)__msa_ld_b((void*)s, 16); \
+ src2_m = (v16u8)__msa_ld_b((void*)s, 32); \
+ src3_m = (v16u8)__msa_ld_b((void*)s, 48); \
+ src4_m = (v16u8)__msa_ld_b((void*)t, 0); \
+ src5_m = (v16u8)__msa_ld_b((void*)t, 16); \
+ src6_m = (v16u8)__msa_ld_b((void*)t, 32); \
+ src7_m = (v16u8)__msa_ld_b((void*)t, 48); \
vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
@@ -201,14 +201,14 @@
reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
- src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \
- src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \
- src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \
- src3_m = (v16u8)__msa_ld_b((v16i8*)s, 112); \
- src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \
- src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \
- src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \
- src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \
+ src0_m = (v16u8)__msa_ld_b((void*)s, 64); \
+ src1_m = (v16u8)__msa_ld_b((void*)s, 80); \
+ src2_m = (v16u8)__msa_ld_b((void*)s, 96); \
+ src3_m = (v16u8)__msa_ld_b((void*)s, 112); \
+ src4_m = (v16u8)__msa_ld_b((void*)t, 64); \
+ src5_m = (v16u8)__msa_ld_b((void*)t, 80); \
+ src6_m = (v16u8)__msa_ld_b((void*)t, 96); \
+ src7_m = (v16u8)__msa_ld_b((void*)t, 112); \
vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
@@ -275,17 +275,17 @@
// Load I444 pixel data
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
{ \
- uint64 y_m, u_m, v_m; \
+ uint64_t y_m, u_m, v_m; \
v2i64 zero_m = {0}; \
y_m = LD(psrc_y); \
u_m = LD(psrc_u); \
v_m = LD(psrc_v); \
- out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \
- out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \
- out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \
+ out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \
+ out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \
+ out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
-void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
@@ -302,7 +302,7 @@
}
}
-void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1, dst2, dst3;
@@ -319,10 +319,10 @@
}
}
-void I422ToYUY2Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width) {
int x;
v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
@@ -343,10 +343,10 @@
}
}
-void I422ToUYVYRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width) {
int x;
v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
@@ -367,10 +367,10 @@
}
}
-void I422ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -390,18 +390,18 @@
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
- STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_y += 8;
src_u += 4;
src_v += 4;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void I422ToRGBARow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -421,23 +421,23 @@
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
- STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
+ STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
src_y += 8;
src_u += 4;
src_v += 4;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void I422AlphaToARGBRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* rgb_buf,
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- int64 data_a;
+ int64_t data_a;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
@@ -457,23 +457,23 @@
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
- STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
+ STOREARGB(vec0, vec1, vec2, src3, dst_argb);
src_y += 8;
src_u += 4;
src_v += 4;
src_a += 8;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void I422ToRGB24Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
- int32 width) {
+ int32_t width) {
int x;
- int64 data_u, data_v;
+ int64_t data_u, data_v;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
@@ -510,20 +510,20 @@
dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
- ST_UB2(dst0, dst1, rgb_buf, 16);
- ST_UB(dst2, (rgb_buf + 32));
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ ST_UB(dst2, (dst_argb + 32));
src_y += 16;
src_u += 8;
src_v += 8;
- rgb_buf += 48;
+ dst_argb += 48;
}
}
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
-void I422ToRGB565Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -558,10 +558,10 @@
}
// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
-void I422ToARGB4444Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -598,10 +598,10 @@
}
}
-void I422ToARGB1555Row_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -638,7 +638,7 @@
}
}
-void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
@@ -652,12 +652,12 @@
}
}
-void YUY2ToUVRow_MSA(const uint8* src_yuy2,
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
int src_stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+ const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, dst0, dst1;
@@ -682,9 +682,9 @@
}
}
-void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
@@ -703,7 +703,7 @@
}
}
-void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
@@ -717,12 +717,12 @@
}
}
-void UYVYToUVRow_MSA(const uint8* src_uyvy,
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
int src_stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
+ const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
int x;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, dst0, dst1;
@@ -747,9 +747,9 @@
}
}
-void UYVYToUV422Row_MSA(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
@@ -768,7 +768,7 @@
}
}
-void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -814,13 +814,13 @@
}
}
-void ARGBToUVRow_MSA(const uint8* src_argb0,
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* src_argb0_next = src_argb0 + src_stride_argb;
+ const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
@@ -932,7 +932,7 @@
}
}
-void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
@@ -942,10 +942,10 @@
21, 22, 24, 25, 26, 28, 29, 30};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
@@ -956,7 +956,7 @@
}
}
-void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
@@ -966,10 +966,10 @@
21, 20, 26, 25, 24, 30, 29, 28};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
@@ -980,15 +980,15 @@
}
}
-void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
int x;
v16u8 src0, src1, dst0;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
@@ -1014,15 +1014,17 @@
}
}
-void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
int x;
v16u8 src0, src1, dst0;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
@@ -1054,7 +1056,9 @@
}
}
-void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ int width) {
int x;
v16u8 src0, src1;
v16u8 vec0, vec1;
@@ -1062,8 +1066,8 @@
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
@@ -1077,11 +1081,11 @@
}
}
-void ARGBToUV444Row_MSA(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
- int32 width) {
- int32 x;
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int32_t width) {
+ int32_t x;
v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 vec8, vec9, vec10, vec11;
@@ -1094,10 +1098,10 @@
v16i8 zero = {0};
for (x = width; x > 0; x -= 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
@@ -1149,9 +1153,9 @@
}
}
-void ARGBMultiplyRow_MSA(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1, dst0;
@@ -1160,8 +1164,8 @@
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
@@ -1188,18 +1192,18 @@
}
}
-void ARGBAddRow_MSA(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
@@ -1209,18 +1213,18 @@
}
}
-void ARGBSubtractRow_MSA(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_subs_u_b(src0, src2);
dst1 = __msa_subs_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
@@ -1230,7 +1234,9 @@
}
}
-void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
int x;
v16u8 src0, src1, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
@@ -1239,8 +1245,8 @@
v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
@@ -1295,9 +1301,9 @@
}
}
-void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
- uint8* dst_rgb,
- uint32 dither4,
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ uint32_t dither4,
int width) {
int x;
v16u8 src0, src1, dst0, vec0, vec1;
@@ -1310,8 +1316,8 @@
vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
@@ -1339,15 +1345,15 @@
}
}
-void ARGBShuffleRow_MSA(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
int x;
v16u8 src0, src1, dst0, dst1;
v16i8 vec0;
v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
- int32 val = LW((int32*)shuffler);
+ int32_t val = LW((int32_t*)shuffler);
vec0 = (v16i8)__msa_fill_w(val);
shuffler_vec += vec0;
@@ -1363,10 +1369,10 @@
}
}
-void ARGBShadeRow_MSA(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
+ uint32_t value) {
int x;
v16u8 src0, dst0;
v8u16 vec0, vec1;
@@ -1402,7 +1408,7 @@
}
}
-void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, vec0, vec1, dst0, dst1;
v8u16 reg0;
@@ -1427,7 +1433,7 @@
}
}
-void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
v8u16 reg0, reg1, reg2;
@@ -1468,8 +1474,8 @@
}
}
-void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1;
@@ -1497,8 +1503,8 @@
}
}
-void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
int x;
v8u16 src0, src1;
@@ -1508,8 +1514,8 @@
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
- src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
+ src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
@@ -1547,7 +1553,9 @@
}
}
-void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
int x;
v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -1558,8 +1566,8 @@
v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
- src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
+ src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16);
vec0 = src0 & const_0x1F;
vec1 = src0 & const_0x7E0;
vec2 = src0 & const_0xF800;
@@ -1592,7 +1600,9 @@
}
}
-void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
int x;
v16u8 src0, src1, src2;
v16u8 vec0, vec1, vec2;
@@ -1601,9 +1611,9 @@
v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32);
vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
@@ -1617,7 +1627,7 @@
}
}
-void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, src1, src2;
v16u8 vec0, vec1, vec2;
@@ -1626,9 +1636,9 @@
v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
@@ -1642,7 +1652,9 @@
}
}
-void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
int x;
v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -1654,8 +1666,8 @@
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
- src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
+ src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
+ src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
@@ -1699,7 +1711,7 @@
}
}
-void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@@ -1713,8 +1725,8 @@
v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
- src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
+ src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
+ src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
vec0 = src0 & const_0x1F;
vec1 = src0 & const_0x7E0;
vec2 = src0 & const_0xF800;
@@ -1762,7 +1774,7 @@
}
}
-void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1777,9 +1789,9 @@
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1803,7 +1815,7 @@
}
}
-void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@@ -1818,9 +1830,9 @@
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@@ -1844,14 +1856,14 @@
}
}
-void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint16* s = (const uint16*)src_argb1555;
- const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
+ const uint16_t* s = (const uint16_t*)src_argb1555;
+ const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
int64_t res0, res1;
v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
@@ -1865,10 +1877,10 @@
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
- src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
- src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
- src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ src0 = (v8u16)__msa_ld_b((void*)s, 0);
+ src1 = (v8u16)__msa_ld_b((void*)s, 16);
+ src2 = (v8u16)__msa_ld_b((void*)t, 0);
+ src3 = (v8u16)__msa_ld_b((void*)t, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
vec0 += src2 & const_0x1F;
@@ -1925,14 +1937,14 @@
}
}
-void RGB565ToUVRow_MSA(const uint8* src_rgb565,
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint16* s = (const uint16*)src_rgb565;
- const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
+ const uint16_t* s = (const uint16_t*)src_rgb565;
+ const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
int64_t res0, res1;
v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
@@ -1947,10 +1959,10 @@
v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
for (x = 0; x < width; x += 16) {
- src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
- src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
- src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
- src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+ src0 = (v8u16)__msa_ld_b((void*)s, 0);
+ src1 = (v8u16)__msa_ld_b((void*)s, 16);
+ src2 = (v8u16)__msa_ld_b((void*)t, 0);
+ src3 = (v8u16)__msa_ld_b((void*)t, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
vec0 += src2 & const_0x1F;
@@ -2005,15 +2017,15 @@
}
}
-void RGB24ToUVRow_MSA(const uint8* src_rgb0,
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
- int64 res0, res1;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
+ int64_t res0, res1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
@@ -2029,12 +2041,12 @@
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
- inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
- inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
- inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
- inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
- inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ inp0 = (v16u8)__msa_ld_b((void*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((void*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((void*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((void*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((void*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((void*)t, 32);
src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
@@ -2110,15 +2122,15 @@
}
}
-void RAWToUVRow_MSA(const uint8* src_rgb0,
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
- int64 res0, res1;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
+ int64_t res0, res1;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
@@ -2134,12 +2146,12 @@
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
- inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
- inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
- inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
- inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
- inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
- inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ inp0 = (v16u8)__msa_ld_b((void*)s, 0);
+ inp1 = (v16u8)__msa_ld_b((void*)s, 16);
+ inp2 = (v16u8)__msa_ld_b((void*)s, 32);
+ inp3 = (v16u8)__msa_ld_b((void*)t, 0);
+ inp4 = (v16u8)__msa_ld_b((void*)t, 16);
+ inp5 = (v16u8)__msa_ld_b((void*)t, 32);
src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
@@ -2215,13 +2227,13 @@
}
}
-void NV12ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- uint64 val0, val1;
+ uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
@@ -2245,20 +2257,20 @@
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
- ST_UB2(dst0, dst1, rgb_buf, 16);
+ ST_UB2(dst0, dst1, dst_argb, 16);
src_y += 8;
src_uv += 8;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void NV12ToRGB565Row_MSA(const uint8* src_y,
- const uint8* src_uv,
- uint8* rgb_buf,
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- uint64 val0, val1;
+ uint64_t val0, val1;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
@@ -2281,20 +2293,20 @@
vec1 = (vec1 >> 2) << 5;
vec2 = (vec2 >> 3) << 11;
dst0 = (v16u8)(vec0 | vec1 | vec2);
- ST_UB(dst0, rgb_buf);
+ ST_UB(dst0, dst_rgb565);
src_y += 8;
src_uv += 8;
- rgb_buf += 16;
+ dst_rgb565 += 16;
}
}
-void NV21ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_vu,
- uint8* rgb_buf,
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
- uint64 val0, val1;
+ uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
@@ -2320,16 +2332,16 @@
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
- ST_UB2(dst0, dst1, rgb_buf, 16);
+ ST_UB2(dst0, dst1, dst_argb, 16);
src_y += 8;
src_vu += 8;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void SobelRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
@@ -2341,8 +2353,8 @@
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
vec0 = __msa_adds_u_b(src0, src1);
dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
@@ -2355,18 +2367,18 @@
}
}
-void SobelToPlaneRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_y, 16);
@@ -2376,9 +2388,9 @@
}
}
-void SobelXYRow_MSA(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
int x;
v16u8 src0, src1, vec0, vec1, vec2;
@@ -2386,8 +2398,8 @@
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0);
vec0 = __msa_adds_u_b(src0, src1);
vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
@@ -2404,7 +2416,7 @@
}
}
-void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
@@ -2412,10 +2424,10 @@
v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
dst0);
ST_UB(dst0, dst_y);
@@ -2424,7 +2436,7 @@
}
}
-void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
@@ -2432,10 +2444,10 @@
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
@@ -2444,7 +2456,7 @@
}
}
-void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
@@ -2452,10 +2464,10 @@
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
@@ -2464,7 +2476,7 @@
}
}
-void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
@@ -2472,10 +2484,10 @@
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
@@ -2484,14 +2496,14 @@
}
}
-void ARGBToUVJRow_MSA(const uint8* src_rgb0,
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3;
v16u8 dst0, dst1;
@@ -2506,14 +2518,14 @@
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
- src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
- src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
- src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
- src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
- src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)s, 32);
+ src3 = (v16u8)__msa_ld_b((void*)s, 48);
+ src4 = (v16u8)__msa_ld_b((void*)t, 0);
+ src5 = (v16u8)__msa_ld_b((void*)t, 16);
+ src6 = (v16u8)__msa_ld_b((void*)t, 32);
+ src7 = (v16u8)__msa_ld_b((void*)t, 48);
src0 = __msa_aver_u_b(src0, src4);
src1 = __msa_aver_u_b(src1, src5);
src2 = __msa_aver_u_b(src2, src6);
@@ -2524,14 +2536,14 @@
src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
vec0 = __msa_aver_u_b(src4, src6);
vec1 = __msa_aver_u_b(src5, src7);
- src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
- src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
- src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
- src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
- src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
- src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
- src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
- src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
+ src0 = (v16u8)__msa_ld_b((void*)s, 64);
+ src1 = (v16u8)__msa_ld_b((void*)s, 80);
+ src2 = (v16u8)__msa_ld_b((void*)s, 96);
+ src3 = (v16u8)__msa_ld_b((void*)s, 112);
+ src4 = (v16u8)__msa_ld_b((void*)t, 64);
+ src5 = (v16u8)__msa_ld_b((void*)t, 80);
+ src6 = (v16u8)__msa_ld_b((void*)t, 96);
+ src7 = (v16u8)__msa_ld_b((void*)t, 112);
src0 = __msa_aver_u_b(src0, src4);
src1 = __msa_aver_u_b(src1, src5);
src2 = __msa_aver_u_b(src2, src6);
@@ -2554,14 +2566,14 @@
}
}
-void BGRAToUVRow_MSA(const uint8* src_rgb0,
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
@@ -2587,14 +2599,14 @@
}
}
-void ABGRToUVRow_MSA(const uint8* src_rgb0,
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
v16u8 src0, src1, src2, src3;
v16u8 dst0, dst1;
v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
@@ -2621,14 +2633,14 @@
}
}
-void RGBAToUVRow_MSA(const uint8* src_rgb0,
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
int src_stride_rgb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
int x;
- const uint8* s = src_rgb0;
- const uint8* t = src_rgb0 + src_stride_rgb;
+ const uint8_t* s = src_rgb0;
+ const uint8_t* t = src_rgb0 + src_stride_rgb;
v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15,
@@ -2654,10 +2666,10 @@
}
}
-void I444ToARGBRow_MSA(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* rgb_buf,
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -2714,15 +2726,15 @@
vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
- ST_UB2(dst0, dst1, rgb_buf, 16);
+ ST_UB2(dst0, dst1, dst_argb, 16);
src_y += 8;
src_u += 8;
src_v += 8;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
v8i16 vec0, vec1;
@@ -2734,7 +2746,7 @@
v8i16 zero = {0};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
@@ -2768,19 +2780,19 @@
dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
- ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
src_y += 16;
- rgb_buf += 64;
+ dst_argb += 64;
}
}
-void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
int x;
v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_y, 0);
vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
@@ -2795,8 +2807,8 @@
}
}
-void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
- uint8* rgb_buf,
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -2812,19 +2824,19 @@
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
- STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_yuy2 += 16;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void UYVYToARGBRow_MSA(const uint8* src_uyvy,
- uint8* rgb_buf,
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
@@ -2840,27 +2852,27 @@
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
- STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
+ STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_uyvy += 16;
- rgb_buf += 32;
+ dst_argb += 32;
}
}
-void InterpolateRow_MSA(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int width,
- int32 source_y_fraction) {
- int32 y1_fraction = source_y_fraction;
- int32 y0_fraction = 256 - y1_fraction;
- uint16 y_fractions;
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
+ int32_t source_y_fraction) {
+ int32_t y1_fraction = source_y_fraction;
+ int32_t y0_fraction = 256 - y1_fraction;
+ uint16_t y_fractions;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
int x;
v16u8 src0, src1, src2, src3, dst0, dst1;
v8u16 vec0, vec1, vec2, vec3, y_frac;
@@ -2872,10 +2884,10 @@
if (128 == y1_fraction) {
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
- src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)t, 0);
+ src3 = (v16u8)__msa_ld_b((void*)t, 16);
dst0 = __msa_aver_u_b(src0, src2);
dst1 = __msa_aver_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_ptr, 16);
@@ -2886,14 +2898,14 @@
return;
}
- y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
+ y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
y_frac = (v8u16)__msa_fill_h(y_fractions);
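// y_fractions packs y0_fraction in the low byte and y1_fraction in the high
// byte, so each interleaved s/t byte pair is weighted in one widening multiply.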
for (x = 0; x < width; x += 32) {
- src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
- src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src0 = (v16u8)__msa_ld_b((void*)s, 0);
+ src1 = (v16u8)__msa_ld_b((void*)s, 16);
+ src2 = (v16u8)__msa_ld_b((void*)t, 0);
+ src3 = (v16u8)__msa_ld_b((void*)t, 16);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
@@ -2915,9 +2927,9 @@
}
}
-void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
int x;
- v16u8 dst0 = (v16u8)__msa_fill_w(v32);
+ v4i32 dst0 = __builtin_msa_fill_w(v32);
for (x = 0; x < width; x += 4) {
ST_UB(dst0, dst_argb);
@@ -2925,7 +2937,7 @@
}
}
-void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
int x;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
@@ -2935,9 +2947,9 @@
24, 23, 28, 27, 26, 31, 30, 29};
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
- src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
+ src0 = (v16u8)__msa_ld_b((void*)src_raw, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_raw, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_raw, 32);
src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
@@ -2950,16 +2962,16 @@
}
}
-void MergeUVRow_MSA(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_MSA(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
int x;
v16u8 src0, src1, dst0, dst1;
for (x = 0; x < width; x += 16) {
- src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
- src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
+ src0 = (v16u8)__msa_ld_b((void*)src_u, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_v, 0);
dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
ST_UB2(dst0, dst1, dst_uv, 16);
@@ -2969,6 +2981,529 @@
}
}
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ int i;
+ v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+
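+ // Successive pckod_b passes keep the odd bytes twice over, reducing
+ // 16 ARGB pixels to their 16 alpha bytes for a single store.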
+ for (i = 0; i < width; i += 16) {
+ src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
+ vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_a);
+ src_argb += 64;
+ dst_a += 16;
+ }
+}
+
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1;
+ v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
+ v8u16 const_256 = (v8u16)__msa_ldi_h(256);
+ v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+ v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+ v16i8 zero = {0};
+
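+ // Per channel: dst = src0 + ((256 - src0_alpha) * src1 >> 8), with the
+ // destination alpha forced to 255 by the bmnz mask at the end.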
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
+ vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+ vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+ vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+ vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+ vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+ vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+ vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
+ vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
+ vec8 = (v8u16)__msa_fill_h(vec0[3]);
+ vec9 = (v8u16)__msa_fill_h(vec0[7]);
+ vec10 = (v8u16)__msa_fill_h(vec1[3]);
+ vec11 = (v8u16)__msa_fill_h(vec1[7]);
+ vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+ vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+ vec10 = (v8u16)__msa_fill_h(vec2[3]);
+ vec11 = (v8u16)__msa_fill_h(vec2[7]);
+ vec12 = (v8u16)__msa_fill_h(vec3[3]);
+ vec13 = (v8u16)__msa_fill_h(vec3[7]);
+ vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+ vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
+ vec8 = const_256 - vec8;
+ vec9 = const_256 - vec9;
+ vec10 = const_256 - vec10;
+ vec11 = const_256 - vec11;
+ vec8 *= vec4;
+ vec9 *= vec5;
+ vec10 *= vec6;
+ vec11 *= vec7;
+ vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
+ vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
+ vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
+ vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
+ vec0 += vec8;
+ vec1 += vec9;
+ vec2 += vec10;
+ vec3 += vec11;
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst0 = __msa_bmnz_v(dst0, const_255, mask);
+ dst1 = __msa_bmnz_v(dst1, const_255, mask);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb0 += 32;
+ src_argb1 += 32;
+ dst_argb += 32;
+ }
+}
+
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+ int scale,
+ int interval_size,
+ int interval_offset,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v4i32 vec_scale = __msa_fill_w(scale);
+ v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
+ v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
+ v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
+ v16i8 zero = {0};
+
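+ // B/G/R become ((v * scale) >> 16) * interval_size + interval_offset;
+ // the final vshf mask copies the original alpha bytes back in.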
+ for (x = 0; x < width; x += 8) {
+ src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0);
+ src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16);
+ src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32);
+ src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48);
+ vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
+ vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+ vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+ vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+ vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+ vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
+ vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
+ tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+ tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+ tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+ tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+ tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
+ tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
+ tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
+ tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
+ tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
+ tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
+ tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
+ tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
+ tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
+ tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
+ tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
+ tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
+ tmp0 *= vec_scale;
+ tmp1 *= vec_scale;
+ tmp2 *= vec_scale;
+ tmp3 *= vec_scale;
+ tmp4 *= vec_scale;
+ tmp5 *= vec_scale;
+ tmp6 *= vec_scale;
+ tmp7 *= vec_scale;
+ tmp8 *= vec_scale;
+ tmp9 *= vec_scale;
+ tmp10 *= vec_scale;
+ tmp11 *= vec_scale;
+ tmp12 *= vec_scale;
+ tmp13 *= vec_scale;
+ tmp14 *= vec_scale;
+ tmp15 *= vec_scale;
+ tmp0 >>= 16;
+ tmp1 >>= 16;
+ tmp2 >>= 16;
+ tmp3 >>= 16;
+ tmp4 >>= 16;
+ tmp5 >>= 16;
+ tmp6 >>= 16;
+ tmp7 >>= 16;
+ tmp8 >>= 16;
+ tmp9 >>= 16;
+ tmp10 >>= 16;
+ tmp11 >>= 16;
+ tmp12 >>= 16;
+ tmp13 >>= 16;
+ tmp14 >>= 16;
+ tmp15 >>= 16;
+ vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+ vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+ vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+ vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+ vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+ vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+ dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+ dst0 *= vec_int_sz;
+ dst1 *= vec_int_sz;
+ dst2 *= vec_int_sz;
+ dst3 *= vec_int_sz;
+ dst0 += vec_int_ofst;
+ dst1 += vec_int_ofst;
+ dst2 += vec_int_ofst;
+ dst3 += vec_int_ofst;
+ dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
+ dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
+ dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
+ dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
+ ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+ dst_argb += 64;
+ }
+}
+
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
+ int width) {
+ int32_t x;
+ v16i8 src0;
+ v16u8 src1, src2, dst0, dst1;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v16i8 zero = {0};
+ v8i16 max = __msa_ldi_h(255);
+
+ src0 = __msa_ld_b((void*)matrix_argb, 0);
+ vec0 = (v8i16)__msa_ilvr_b(zero, src0);
+ vec1 = (v8i16)__msa_ilvl_b(zero, src0);
+
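+ // Each output channel is a signed dot product of the pixel with one row
+ // of matrix_argb in 6-bit fixed point (>> 6), clamped to [0, 255].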
+ for (x = 0; x < width; x += 8) {
+ src1 = (v16u8)__msa_ld_b((void*)src_argb, 0);
+ src2 = (v16u8)__msa_ld_b((void*)src_argb, 16);
+ vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+ vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+ vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+ vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+ vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
+ vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
+ vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
+ vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
+ vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
+ vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
+ vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
+ vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
+ vec10 = vec2 * vec0;
+ vec11 = vec2 * vec1;
+ vec12 = vec6 * vec0;
+ vec13 = vec6 * vec1;
+ tmp0 = __msa_hadd_s_w(vec10, vec10);
+ tmp1 = __msa_hadd_s_w(vec11, vec11);
+ tmp2 = __msa_hadd_s_w(vec12, vec12);
+ tmp3 = __msa_hadd_s_w(vec13, vec13);
+ vec14 = vec3 * vec0;
+ vec15 = vec3 * vec1;
+ vec16 = vec7 * vec0;
+ vec17 = vec7 * vec1;
+ tmp4 = __msa_hadd_s_w(vec14, vec14);
+ tmp5 = __msa_hadd_s_w(vec15, vec15);
+ tmp6 = __msa_hadd_s_w(vec16, vec16);
+ tmp7 = __msa_hadd_s_w(vec17, vec17);
+ vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+ vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+ tmp0 = __msa_hadd_s_w(vec10, vec10);
+ tmp1 = __msa_hadd_s_w(vec11, vec11);
+ tmp2 = __msa_hadd_s_w(vec12, vec12);
+ tmp3 = __msa_hadd_s_w(vec13, vec13);
+ tmp0 = __msa_srai_w(tmp0, 6);
+ tmp1 = __msa_srai_w(tmp1, 6);
+ tmp2 = __msa_srai_w(tmp2, 6);
+ tmp3 = __msa_srai_w(tmp3, 6);
+ vec2 = vec4 * vec0;
+ vec6 = vec4 * vec1;
+ vec3 = vec8 * vec0;
+ vec7 = vec8 * vec1;
+ tmp8 = __msa_hadd_s_w(vec2, vec2);
+ tmp9 = __msa_hadd_s_w(vec6, vec6);
+ tmp10 = __msa_hadd_s_w(vec3, vec3);
+ tmp11 = __msa_hadd_s_w(vec7, vec7);
+ vec4 = vec5 * vec0;
+ vec8 = vec5 * vec1;
+ vec5 = vec9 * vec0;
+ vec9 = vec9 * vec1;
+ tmp12 = __msa_hadd_s_w(vec4, vec4);
+ tmp13 = __msa_hadd_s_w(vec8, vec8);
+ tmp14 = __msa_hadd_s_w(vec5, vec5);
+ tmp15 = __msa_hadd_s_w(vec9, vec9);
+ vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+ vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+ vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+ vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+ tmp4 = __msa_hadd_s_w(vec14, vec14);
+ tmp5 = __msa_hadd_s_w(vec15, vec15);
+ tmp6 = __msa_hadd_s_w(vec16, vec16);
+ tmp7 = __msa_hadd_s_w(vec17, vec17);
+ tmp4 = __msa_srai_w(tmp4, 6);
+ tmp5 = __msa_srai_w(tmp5, 6);
+ tmp6 = __msa_srai_w(tmp6, 6);
+ tmp7 = __msa_srai_w(tmp7, 6);
+ vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+ vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+ vec10 = __msa_maxi_s_h(vec10, 0);
+ vec11 = __msa_maxi_s_h(vec11, 0);
+ vec12 = __msa_maxi_s_h(vec12, 0);
+ vec13 = __msa_maxi_s_h(vec13, 0);
+ vec10 = __msa_min_s_h(vec10, max);
+ vec11 = __msa_min_s_h(vec11, max);
+ vec12 = __msa_min_s_h(vec12, max);
+ vec13 = __msa_min_s_h(vec13, max);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
+ ST_UB2(dst0, dst1, dst_argb, 16);
+ src_argb += 32;
+ dst_argb += 32;
+ }
+}
+
+void SplitUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+
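+ // pckev_b collects the even (U) bytes and pckod_b the odd (V) bytes,
+ // deinterleaving 32 UV pairs per iteration.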
+ for (x = 0; x < width; x += 32) {
+ src0 = (v16u8)__msa_ld_b((void*)src_uv, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_uv, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_uv, 32);
+ src3 = (v16u8)__msa_ld_b((void*)src_uv, 48);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+ dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+ dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_u, 16);
+ ST_UB2(dst2, dst3, dst_v, 16);
+ src_uv += 64;
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
+ int x;
+ v16u8 dst0 = (v16u8)__msa_fill_b(v8);
+
+ for (x = 0; x < width; x += 16) {
+ ST_UB(dst0, dst);
+ dst += 16;
+ }
+}
+
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ int x;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
+ v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
+
+ src_uv += (2 * width);
+
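+ // Start past the end of the row; each iteration steps back 64 bytes and
+ // the reversal masks emit the U and V bytes back to front.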
+ for (x = 0; x < width; x += 32) {
+ src_uv -= 64;
+ src2 = (v16u8)__msa_ld_b((void*)src_uv, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_uv, 16);
+ src0 = (v16u8)__msa_ld_b((void*)src_uv, 32);
+ src1 = (v16u8)__msa_ld_b((void*)src_uv, 48);
+ dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+ dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+ dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+ ST_UB2(dst0, dst1, dst_v, 16);
+ ST_UB2(dst2, dst3, dst_u, 16);
+ dst_u += 32;
+ dst_v += 32;
+ }
+}
+
+void SobelXRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
+ int32_t width) {
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, dst0;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
+ v16i8 tmp = __msa_ldi_b(8);
+ v16i8 mask1 = mask0 + tmp;
+ v8i16 zero = {0};
+ v8i16 max = __msa_ldi_h(255);
+
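+ // Sobel X: horizontal differences from the pairing masks, weighted
+ // 1:2:1 across the three rows, then absolute value clamped to 255.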
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_y0, 16);
+ src2 = (v16u8)__msa_ld_b((void*)src_y1, 0);
+ src3 = (v16u8)__msa_ld_b((void*)src_y1, 16);
+ src4 = (v16u8)__msa_ld_b((void*)src_y2, 0);
+ src5 = (v16u8)__msa_ld_b((void*)src_y2, 16);
+ vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+ vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+ vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+ vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
+ vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+ vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+ vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+ vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+ vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+ vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
+ vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
+ vec0 += vec2;
+ vec1 += vec3;
+ vec4 += vec2;
+ vec5 += vec3;
+ vec0 += vec4;
+ vec1 += vec5;
+ vec0 = __msa_add_a_h(zero, vec0);
+ vec1 = __msa_add_a_h(zero, vec1);
+ vec0 = __msa_maxi_s_h(vec0, 0);
+ vec1 = __msa_maxi_s_h(vec1, 0);
+ vec0 = __msa_min_s_h(max, vec0);
+ vec1 = __msa_min_s_h(max, vec1);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_sobelx);
+ src_y0 += 16;
+ src_y1 += 16;
+ src_y2 += 16;
+ dst_sobelx += 16;
+ }
+}
+
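+// Sobel Y gradient over two input rows; vec6 supplies the two bytes past the
+// current 16-byte block that the i+1 and i+2 taps need. Scalar equivalent:
+//   dst[i] = clamp255(abs((y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
+//                         (y0[i + 2] - y1[i + 2])))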
+void SobelYRow_MSA(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
+ int32_t width) {
+ int x;
+ v16u8 src0, src1, dst0;
+ v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+ v8i16 zero = {0};
+ v8i16 max = __msa_ldi_h(255);
+
+ for (x = 0; x < width; x += 16) {
+ src0 = (v16u8)__msa_ld_b((void*)src_y0, 0);
+ src1 = (v16u8)__msa_ld_b((void*)src_y1, 0);
+ vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
+ vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
+ vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+ vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+ vec0 -= vec2;
+ vec1 -= vec3;
+ vec6[0] = src_y0[16] - src_y1[16];
+ vec6[1] = src_y0[17] - src_y1[17];
+ vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
+ vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
+ vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
+ vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
+ vec0 += vec2;
+ vec1 += vec3;
+ vec4 += vec2;
+ vec5 += vec3;
+ vec0 += vec4;
+ vec1 += vec5;
+ vec0 = __msa_add_a_h(zero, vec0);
+ vec1 = __msa_add_a_h(zero, vec1);
+ vec0 = __msa_maxi_s_h(vec0, 0);
+ vec1 = __msa_maxi_s_h(vec1, 0);
+ vec0 = __msa_min_s_h(max, vec0);
+ vec1 = __msa_min_s_h(max, vec1);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+ ST_UB(dst0, dst_sobely);
+ src_y0 += 16;
+ src_y1 += 16;
+ dst_sobely += 16;
+ }
+}
+
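+// Converts 16-bit values to IEEE half floats, scaled. 1.9259299444e-34f is
+// 2^-112: multiplying by scale * 2^-112 rebiases the binary32 exponent (bias
+// 127) to the binary16 bias (15), so shifting the raw float bits right by 13
+// (23 - 10 mantissa bits) leaves the half-float value in the low halfword. A
+// scalar sketch of the same trick, ignoring rounding and overflow handling:
+//   float f = v * (scale * 1.9259299444e-34f);
+//   uint32_t bits;
+//   memcpy(&bits, &f, 4);
+//   dst[i] = (uint16_t)(bits >> 13);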
+void HalfFloatRow_MSA(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ int i;
+ v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
+ v4f32 mult_vec;
+ v8i16 zero = {0};
+ mult_vec[0] = 1.9259299444e-34f * scale;
+ mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
+
+ for (i = 0; i < width; i += 32) {
+ src0 = (v8u16)__msa_ld_h((void*)src, 0);
+ src1 = (v8u16)__msa_ld_h((void*)src, 16);
+ src2 = (v8u16)__msa_ld_h((void*)src, 32);
+ src3 = (v8u16)__msa_ld_h((void*)src, 48);
+ vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
+ vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
+ vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
+ vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
+ vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
+ vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
+ vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
+ vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
+ fvec0 = __msa_ffint_u_w(vec0);
+ fvec1 = __msa_ffint_u_w(vec1);
+ fvec2 = __msa_ffint_u_w(vec2);
+ fvec3 = __msa_ffint_u_w(vec3);
+ fvec4 = __msa_ffint_u_w(vec4);
+ fvec5 = __msa_ffint_u_w(vec5);
+ fvec6 = __msa_ffint_u_w(vec6);
+ fvec7 = __msa_ffint_u_w(vec7);
+ fvec0 *= mult_vec;
+ fvec1 *= mult_vec;
+ fvec2 *= mult_vec;
+ fvec3 *= mult_vec;
+ fvec4 *= mult_vec;
+ fvec5 *= mult_vec;
+ fvec6 *= mult_vec;
+ fvec7 *= mult_vec;
+ vec0 = ((v4u32)fvec0) >> 13;
+ vec1 = ((v4u32)fvec1) >> 13;
+ vec2 = ((v4u32)fvec2) >> 13;
+ vec3 = ((v4u32)fvec3) >> 13;
+ vec4 = ((v4u32)fvec4) >> 13;
+ vec5 = ((v4u32)fvec5) >> 13;
+ vec6 = ((v4u32)fvec6) >> 13;
+ vec7 = ((v4u32)fvec7) >> 13;
+ dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+ dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
+ dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+ dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+ ST_UH2(dst0, dst1, dst, 8);
+ ST_UH2(dst2, dst3, dst + 16, 8);
+ src += 32;
+ dst += 32;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/row_neon.cc b/files/source/row_neon.cc
index bed14e0..a12fa79 100644
--- a/files/source/row_neon.cc
+++ b/files/source/row_neon.cc
@@ -22,54 +22,42 @@
!defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.32 {d2[0]}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.32 {d2[1]}, [%2]! \n"
+#define READYUV422 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ "vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- MEMACCESS(2) \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
+#define READYUV444 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
- MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+#define READNV12 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
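+// After the vuzp/vtrn pair above, d2 holds 4 U bytes followed by 4 V bytes,
+// the same UUUUVVVV layout READYUV422 produces, so YUVTORGB needs no NV12
+// special case.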
// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
+#define READNV21 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
// Read 8 YUY2
#define READYUY2 \
- MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
@@ -77,26 +65,19 @@
// Read 8 UYVY
#define READUYVY \
- MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
-#define YUVTORGB_SETUP \
- MEMACCESS([kUVToRB]) \
- "vld1.8 {d24}, [%[kUVToRB]] \n" \
- MEMACCESS([kUVToG]) \
- "vld1.8 {d25}, [%[kUVToG]] \n" \
- MEMACCESS([kUVBiasBGR]) \
- "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
- MEMACCESS([kUVBiasBGR]) \
- "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
- MEMACCESS([kUVBiasBGR]) \
- "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
- MEMACCESS([kYToRgb]) \
- "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+#define YUVTORGB_SETUP \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
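+// YUVTORGB_SETUP runs once per call, outside the pixel loop: it loads the
+// UV-to-RB and UV-to-G coefficients into d24/d25, the BGR bias constants
+// into d26-d29 and q4, and the Y scale factor into d30/d31.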
#define YUVTORGB \
"vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \
@@ -125,156 +106,135 @@
"vqshrun.s16 d22, q9, #6 \n" /* R */ \
"vqshrun.s16 d21, q0, #6 \n" /* G */
-void I444ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READYUV444
- YUVTORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV444 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %5, %5, #8 \n"
- MEMACCESS(3)
- "vld1.8 {d23}, [%3]! \n"
- MEMACCESS(4)
- "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %5, %5, #8 \n"
+ "vld1.8 {d23}, [%3]! \n"
+ "vst4.8 {d20, d21, d22, d23}, [%4]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(src_a), // %3
+ "+r"(dst_argb), // %4
+ "+r"(width) // %5
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void I422ToRGBARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB
- MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgba), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n" // YUVTORGB modified d19
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void I422ToRGB24Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
#define ARGBTORGB565 \
@@ -284,34 +244,29 @@
"vsri.16 q0, q8, #5 \n" /* RG */ \
"vsri.16 q0, q9, #11 \n" /* RGB */
-void I422ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- ARGBTORGB565
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
#define ARGBTOARGB1555 \
@@ -323,35 +278,30 @@
"vsri.16 q0, q9, #6 \n" /* ARG */ \
"vsri.16 q0, q10, #11 \n" /* ARGB */
-void I422ToARGB1555Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB1555
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB1555
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 ARGB1555 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
#define ARGBTOARGB4444 \
@@ -363,447 +313,488 @@
"vorr d1, d22, d23 \n" /* RA */ \
"vzip.u8 d0, d1 \n" /* BGRA */
-void I422ToARGB4444Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- READYUV422
- YUVTORGB
- "subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
- ARGBTOARGB4444
- MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb4444), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d4, #0x0f \n" // vbic bits to clear
+ "1: \n"
+
+ READYUV422 YUVTORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n" ARGBTOARGB4444
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 ARGB4444 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READYUV400
- YUVTORGB
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUV400 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
+ [kUVToG] "r"(&kYuvI601Constants.kUVToG),
+ [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
+ [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d23, #255 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d20", "d21", "d22", "d23"
- );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23");
}
-void NV12ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READNV12
- YUVTORGB
- "subs %3, %3, #8 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
}
-void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READNV21
- YUVTORGB
- "subs %3, %3, #8 \n"
- MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_vu), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READNV21 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
}
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+
+ YUVTORGB_SETUP
+
+ "1: \n"
+
+ READNV12 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+
+ YUVTORGB_SETUP
+
+ "1: \n"
+
+ READNV21 YUVTORGB
+ "subs %3, %3, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READNV12
- YUVTORGB
- "subs %3, %3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
- "+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READNV12 YUVTORGB
+ "subs %3, %3, #8 \n" ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+ "q12", "q13", "q14", "q15");
}
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READYUY2
- YUVTORGB
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READYUY2 YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
}
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "vmov.u8 d23, #255 \n"
- "1: \n"
- READUYVY
- YUVTORGB
- "subs %2, %2, #8 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ asm volatile(YUVTORGB_SETUP
+ "vmov.u8 d23, #255 \n"
+ "1: \n" READUYVY YUVTORGB
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store U
- MEMACCESS(2)
- "vst1.8 {q1}, [%2]! \n" // store V
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load U
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load V
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(2)
- "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
- "bgt 1b \n"
- :
- "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 packed RGB pixels and writes planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store R
+ "vst1.8 {q1}, [%2]! \n" // store G
+ "vst1.8 {q2}, [%3]! \n" // store B
+ "bgt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+ );
+}
+
+// Reads 16 planar R, G and B values and writes out 16 packed RGB pixels at a time.
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load R
+ "vld1.8 {q1}, [%1]! \n" // load G
+ "vld1.8 {q2}, [%2]! \n" // load B
+ "subs %4, %4, #16 \n" // 16 processed per loop
+ "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB
+ "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB
+ "bgt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
// Copy multiples of 32 bytes. The four-register vld1.8 allows unaligned access and is fastest on Cortex-A15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
- "subs %2, %2, #32 \n" // 32 processed per loop
- MEMACCESS(1)
- "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "q0", "q1" // Clobber List
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
- asm volatile (
- "vdup.8 q0, %2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %1, %1, #16 \n" // 16 bytes per loop
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v8) // %2
- : "cc", "memory", "q0"
- );
+// SetRow writes 'width' bytes using a repeated 8-bit value.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "q0");
}
-// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
- asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
- "subs %1, %1, #4 \n" // 4 pixels per loop
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store
- "bgt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "cc", "memory", "q0"
- );
+// ARGBSetRow writes 'width' pixels using a repeated 32-bit value.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0");
}
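+// MirrorRow reverses 16 bytes per iteration: the load post-indexes by -16 to
+// walk the row backwards, vrev64.8 reverses the bytes within each doubleword,
+// and storing d1 before d0 swaps the two halves to finish the reversal.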
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2 \n"
- "sub %0, #16 \n"
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #16 \n" // 16 pixels per loop.
- "vrev64.8 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
- );
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0");
}
-void MirrorUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- // Start at end of source row.
- "mov r12, #-16 \n"
- "add %0, %0, %3, lsl #1 \n"
- "sub %0, #16 \n"
+ asm volatile(
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
- "subs %3, #8 \n" // 8 pixels per loop.
- "vrev64.8 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // dst += 8
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "r12", "q0"
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0");
+}
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "mov r3, #-16 \n"
- "add %0, %0, %2, lsl #2 \n"
- "sub %0, #16 \n"
-
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0], r3 \n" // src -= 16
- "subs %2, #4 \n" // 4 pixels per loop.
- "vrev64.32 q0, q0 \n"
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // dst += 16
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "r3", "q0"
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d4, #255 \n" // Alpha
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d4, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3" // Clobber List
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3" // Clobber List
);
}
@@ -819,22 +810,22 @@
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
@@ -865,24 +856,22 @@
"vorr.u8 d2, d1, d5 \n" /* R */ \
"vorr.u8 d1, d4, d6 \n" /* G */
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
@@ -896,500 +885,447 @@
"vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
"vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // Alpha
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vswp.u8 d1, d3 \n" // swap R, B
- MEMACCESS(1)
- "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %2, %2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vst1.8 {d1}, [%1]! \n" // store 8 U.
- MEMACCESS(2)
- "vst1.8 {d3}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of
+ // RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 U.
- MEMACCESS(2)
- "vst1.8 {d2}, [%2]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
-void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "add %1, %0, %1 \n" // stride + src_yuy2
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
- "vrhadd.u8 d1, d1, d5 \n" // average rows of U
- "vrhadd.u8 d3, d3, d7 \n" // average rows of V
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 U.
- MEMACCESS(3)
- "vst1.8 {d3}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
);
}
-void UYVYToUVRow_NEON(const uint8* src_uyvy,
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "add %1, %0, %1 \n" // stride + src_uyvy
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
- "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
- "vrhadd.u8 d0, d0, d4 \n" // average rows of U
- "vrhadd.u8 d2, d2, d6 \n" // average rows of V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 U.
- MEMACCESS(3)
- "vst1.8 {d2}, [%3]! \n" // store 8 V.
- "bgt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ asm volatile(
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+ "d7" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
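// The 16-byte 'shuffler' table holds source byte indices in destination
// order; vtbl.8 uses it to permute 4 ARGB pixels (16 bytes) per iteration.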
-void ARGBShuffleRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // shuffler
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
- "subs %2, %2, #4 \n" // 4 processed per loop
- "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
- "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store 4.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
- MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 Us
- MEMACCESS(2)
- "vld1.8 {d3}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3"
- );
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
}
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
- MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 Us
- MEMACCESS(2)
- "vld1.8 {d2}, [%2]! \n" // load 8 Vs
- "subs %4, %4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "d0", "d1", "d2", "d3"
- );
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3");
}
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
- asm volatile (
- "vdup.32 d2, %2 \n" // dither4
- "1: \n"
- MEMACCESS(1)
- "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d20, d20, d2 \n"
- "vqadd.u8 d21, d21, d2 \n"
- "vqadd.u8 d22, d22, d2 \n"
- ARGBTORGB565
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
- );
+ asm volatile(
+ "vdup.32 d2, %2 \n" // dither4
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d20, d20, d2 \n"
+ "vqadd.u8 d21, d21, d2 \n"
+ "vqadd.u8 d22, d22, d2 \n" // add for dither
+ ARGBTORGB565
+ "vst1.8 {q0}, [%0]! \n" // store 8 RGB565.
+ "bgt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
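A scalar sketch of the dither step, assuming the ARGBTORGB565 macro (defined earlier in this file, not shown in this hunk) performs the usual 8:8:8 -> 5:6:5 truncation; the vdup.32 of dither4 means each pixel in a group of four adds its own byte of the 32-bit dither value to all three colour channels:

static uint16_t DitherPack565_sketch(uint8_t b, uint8_t g, uint8_t r,
                                     uint8_t dither) {
  int b2 = b + dither; if (b2 > 255) b2 = 255;  // vqadd.u8 saturates
  int g2 = g + dither; if (g2 > 255) g2 = 255;
  int r2 = r + dither; if (r2 > 255) r2 = 255;
  return (uint16_t)(((r2 >> 3) << 11) | ((g2 >> 2) << 5) | (b2 >> 3));
}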
-void ARGBToARGB1555Row_NEON(const uint8* src_argb,
- uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
- );
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
}
-void ARGBToARGB4444Row_NEON(const uint8* src_argb,
- uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
int width) {
- asm volatile (
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ asm volatile(
+ "vmov.u8 d4, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
+}
+
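A scalar sketch of the fixed-point luma in ARGBToYRow above, using the BT.601 coefficients the comments name scaled by 128 (the helper name is illustrative, not part of the patch). vqrshrun.s16 #7 is a rounding shift, and vqadd.u8 saturates the +16 bias:

static uint8_t RGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounded >>7
  y += 16;                                       // limited-range offset
  return (uint8_t)(y > 255 ? 255 : y);           // mirrors vqadd.u8 saturation
}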
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q3}, [%1]! \n" // store 16 A's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
-}
-
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
- "subs %2, %2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n" // store 16 A's.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+ asm volatile(
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
- MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
- );
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+ "q15");
}
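A scalar sketch of the U/V arithmetic above (illustrative helper; d0/d1/d2 hold B/G/R after the vld4.8). The 0x8080 bias is 128.5 in 8.8 fixed point, so the +128 recentring and the rounding half for the >>8 are folded into one constant; for these coefficients the 16-bit intermediate never under- or overflows, so the saturating narrow acts as a plain rounded shift:

static void ARGBToUV444_sketch(uint8_t b, uint8_t g, uint8_t r,
                               uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}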
+// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB \
- ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG \
- ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR \
- ", q12 \n" /* R */ \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR \
- ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG \
- ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB \
- ", q13 \n" /* B */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+// clang-format on
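The halved constants ("#112 / 2" and so on) in the subsampled ToUV functions below follow from the averaging pipeline: vpaddl/vpadal sum a 2x2 block of pixels and vrshr #1 halves it, so each 16-bit lane still carries twice a single pixel's value. A sketch of the arithmetic (not patch code):

  u = ((112/2) * (2*b_avg) - (74/2) * (2*g_avg) - (38/2) * (2*r_avg)
       + 0x8080) >> 8;

which reduces to the same expression the 444 path computes on single pixels.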
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb,
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
@@ -1399,17 +1335,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1421,9 +1353,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1438,10 +1368,10 @@
}
// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb,
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
@@ -1451,17 +1381,13 @@
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1473,9 +1399,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1489,10 +1413,10 @@
);
}
-void BGRAToUVRow_NEON(const uint8* src_bgra,
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
@@ -1502,17 +1426,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1524,9 +1444,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -1540,10 +1458,10 @@
);
}
-void ABGRToUVRow_NEON(const uint8* src_abgr,
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
@@ -1553,17 +1471,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1575,9 +1489,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -1591,10 +1503,10 @@
);
}
-void RGBAToUVRow_NEON(const uint8* src_rgba,
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
@@ -1604,17 +1516,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1626,9 +1534,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -1642,10 +1548,10 @@
);
}
-void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
@@ -1655,17 +1561,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
- MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1677,9 +1579,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -1693,10 +1593,10 @@
);
}
-void RAWToUVRow_NEON(const uint8* src_raw,
+void RAWToUVRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
@@ -1706,17 +1606,13 @@
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
+ "1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
- MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1728,9 +1624,7 @@
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
- MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -1745,901 +1639,815 @@
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
+ // coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
- MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
- );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
- asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
- );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ asm volatile(
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
}
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction) // %4
- :
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
- );
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
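A scalar sketch of InterpolateRow's three paths (illustrative helper name). The f == 0 and f == 128 branches match the asm's early "beq 100f"/"beq 50f" dispatch, and the general case matches vmull/vmlal with vrshrn's rounded >>8:

static uint8_t InterpolatePixel_sketch(uint8_t p0, uint8_t p1, int f) {
  if (f == 0) return p0;                    // 100/0: copy row unchanged
  if (f == 128) return (p0 + p1 + 1) >> 1;  // 50/50: vrhadd.u8
  return (uint8_t)((p0 * (256 - f) + p1 * f + 128) >> 8);  // rounded blend
}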
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "subs %3, #8 \n"
- "blt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ asm volatile(
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
- "89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
- // Blend 1 pixels.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
- MEMACCESS(1)
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
- "subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
- MEMACCESS(2)
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
+ // Blend 1 pixels.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
- "99: \n"
+ "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
- );
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
}
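A per-channel scalar sketch of the blend, following the dr - dr * sa / 256 + sr identity noted before the function (helper name illustrative). vqrshrn.u16 #8 is a rounding shift, and the vqsub/vqadd pair saturates both directions; destination alpha is then forced to 255 by the vmov.u8 d3, #255:

static uint8_t BlendChannel_sketch(uint8_t s, uint8_t sa, uint8_t d) {
  int shaded = d - ((d * sa + 128) >> 8);   // dr - dr * sa / 256, rounded
  if (shaded < 0) shaded = 0;               // vqsub.u8 saturation
  int out = s + shaded;                     // + sr
  return (uint8_t)(out > 255 ? 255 : out);  // vqadd.u8 saturation
}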
// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- // Attenuate 8 pixels.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
- );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
}
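The attenuate (premultiply) step per colour channel, as a scalar sketch; alpha itself is carried through unchanged in d3:

static uint8_t Attenuate_sketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 128) >> 8);  // vmull.u8 then rounding vqrshrn #8
}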
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb,
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
- asm volatile (
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ asm volatile(
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- MEMACCESS(0)
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
- );
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
}
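A scalar sketch of the quantize formula from the comment above the function. vqdmulh.s16 returns (2 * x * y) >> 16 with saturation, which is why the code halves the scale first; the product is then approximately (x * scale) >> 16:

static uint8_t Quantize_sketch(uint8_t v, int scale, int interval_size,
                               int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return (uint8_t)(q > 255 ? 255 : q);  // vqmovn.u16 saturation
}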
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
- asm volatile (
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ uint32_t value) {
+ asm volatile(
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
- );
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
}
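// Reference sketch (editor's note, not part of this change): scalar form of
// the shade kernel above, assuming little-endian packed ARGB in 'value'
// (B in the low byte). vzip duplicates each scale byte into a 16-bit lane
// (v * 257), so vqrdmulh against the halved lane approximates c * v / 255
// with rounding:
static void ARGBShadeRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  int width, uint32_t value) {
  const uint32_t vb = value & 0xff;
  const uint32_t vg = (value >> 8) & 0xff;
  const uint32_t vr = (value >> 16) & 0xff;
  const uint32_t va = (value >> 24) & 0xff;
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = (uint8_t)((src_argb[0] * vb * 257 + 32768) >> 16);
    dst_argb[1] = (uint8_t)((src_argb[1] * vg * 257 + 32768) >> 16);
    dst_argb[2] = (uint8_t)((src_argb[2] * vr * 257 + 32768) >> 16);
    dst_argb[3] = (uint8_t)((src_argb[3] * va * 257 + 32768) >> 16);
    src_argb += 4;
    dst_argb += 4;
  }
}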
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
- );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
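// Reference sketch (editor's note, not part of this change): scalar form of
// the gray kernel above. The coefficients sum to 128, matching the #7
// narrowing shift; +64 is the rounding term applied by vqrshrun.
static void ARGBGrayRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t y = (uint8_t)(
        (15 * src_argb[0] + 75 * src_argb[1] + 38 * src_argb[2] + 64) >> 7);
    dst_argb[0] = y;            // b
    dst_argb[1] = y;            // g
    dst_argb[2] = y;            // r
    dst_argb[3] = src_argb[3];  // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}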
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
- asm volatile (
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+ "q14", "q15");
}
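// Reference sketch (editor's note, not part of this change): scalar form of
// the in-place sepia kernel above, using the three formulas from the comment
// before the function. vqshrn saturates, hence the explicit clamps.
static void ARGBSepiaRow_C_sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8_t)(sr > 255 ? 255 : sr);
    dst_argb += 4;  // alpha (dst_argb[3]) is unchanged
  }
}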
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+ asm volatile(
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q11, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
- );
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q11, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15");
}
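// Reference sketch (editor's note, not part of this change): scalar form of
// the color-matrix kernel above. matrix_argb holds 16 signed bytes, 4 input
// coefficients per output channel; the #6 shift scales the sums back down.
// Intermediate 16-bit saturation from vqadd is ignored here.
static void ARGBColorMatrixRow_C_sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb,
                                        const int8_t* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {  // output b, g, r, a
      int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
               src_argb[1] * matrix_argb[j * 4 + 1] +
               src_argb[2] * matrix_argb[j * 4 + 2] +
               src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
      dst_argb[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // vqshrun
    }
    src_argb += 4;
    dst_argb += 4;
  }
}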
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q0, d0, d1 \n" // multiply B
- "vmull.u8 q1, d2, d3 \n" // multiply G
- "vmull.u8 q2, d4, d5 \n" // multiply R
- "vmull.u8 q3, d6, d7 \n" // multiply A
- "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
- "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
- "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
- "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 q0, q0, q2 \n" // add B, G
- "vqadd.u8 q1, q1, q3 \n" // add R, A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqsub.u8 q0, q0, q2 \n" // subtract B, G
- "vqsub.u8 q1, q1, q3 \n" // subtract R, A
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2647,58 +2455,50 @@
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
- MEMACCESS(1)
- "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d0, d0, d1 \n" // add
- "vmov.u8 d1, d0 \n"
- "vmov.u8 d2, d0 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
- asm volatile (
- // 16 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
- "subs %3, %3, #16 \n" // 16 processed per loop.
- "vqadd.u8 q0, q0, q1 \n" // add
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2706,75 +2506,64 @@
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "vmov.u8 d3, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
- MEMACCESS(1)
- "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vqadd.u8 d1, d0, d2 \n" // add
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "q0", "q1"
- );
+ asm volatile(
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1");
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
-void SobelXRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0],%5 \n" // top
- MEMACCESS(0)
- "vld1.8 {d1}, [%0],%6 \n"
- "vsubl.u8 q0, d0, d1 \n"
- MEMACCESS(1)
- "vld1.8 {d2}, [%1],%5 \n" // center * 2
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%6 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- MEMACCESS(2)
- "vld1.8 {d2}, [%2],%5 \n" // bottom
- MEMACCESS(2)
- "vld1.8 {d3}, [%2],%6 \n"
- "subs %4, %4, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- MEMACCESS(3)
- "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2), // %5
- "r"(6) // %6
- : "cc", "memory", "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
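// Reference sketch (editor's note, not part of this change): scalar form of
// the SobelX kernel above. The %5/%6 post-increments of 2 and 6 make each
// load pair read pixels at offsets i and i+2 while netting an advance of 8
// bytes per 8-pixel iteration. SobelY below is the same computation with
// the roles of rows and columns swapped.
static void SobelXRow_C_sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                               const uint8_t* src_y2, uint8_t* dst_sobelx,
                               int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b + b + c;                              // 1, 2, 1 weights
    if (sobel < 0) sobel = -sobel;                          // vabs
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);   // vqmovn
  }
}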
@@ -2782,99 +2571,317 @@
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0],%4 \n" // left
- MEMACCESS(1)
- "vld1.8 {d1}, [%1],%4 \n"
- "vsubl.u8 q0, d0, d1 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0],%4 \n" // center * 2
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%4 \n"
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vadd.s16 q0, q0, q1 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0],%5 \n" // right
- MEMACCESS(1)
- "vld1.8 {d3}, [%1],%5 \n"
- "subs %3, %3, #8 \n" // 8 pixels
- "vsubl.u8 q1, d2, d3 \n"
- "vadd.s16 q0, q0, q1 \n"
- "vabs.s16 q0, q0 \n"
- "vqmovn.u16 d0, q0 \n"
- MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 sobely
- "bgt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1), // %4
- "r"(6) // %5
- : "cc", "memory", "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
);
}
-void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
- asm volatile (
- "vdup.32 q0, %3 \n"
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the float in the first or second half of the d-reg.
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, q0 \n" // adjust exponent
- "vmul.f32 q3, q3, q0 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(1.9259299444e-34f) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
}
-// TODO(fbarchard): multiply by element.
-void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
- asm volatile (
- "vdup.32 q0, %3 \n"
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q1}, [%0]! \n" // load 8 shorts
- "subs %2, %2, #8 \n" // 8 pixels per loop
- "vmovl.u16 q2, d2 \n" // 8 int's
- "vmovl.u16 q3, d3 \n"
- "vcvt.f32.u32 q2, q2 \n" // 8 floats
- "vcvt.f32.u32 q3, q3 \n"
- "vmul.f32 q2, q2, q0 \n" // adjust exponent
- "vmul.f32 q3, q3, q0 \n"
- "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
- "vqshrn.u32 d3, q3, #13 \n"
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3"
- );
+ "1: \n"
+ "vld1.8 {q1}, [%0]! \n" // load 8 shorts
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u16 q2, d2 \n" // 8 int's
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // adjust exponent
+ "vmul.f32 q3, q3, %y3 \n"
+ "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
+ "vqshrn.u32 d3, q3, #13 \n"
+ "vst1.8 {q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
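// Reference sketch (editor's note, not part of this change): the magic
// constant above is 2^-112. Multiplying by scale * 2^-112 rebiases the
// float exponent so that bits 13..28 of the single-precision pattern hold a
// half-float value, which vqshrn.u32 #13 then extracts with unsigned
// saturation. Scalar form (assumes IEEE floats, no NaN handling; needs
// <string.h> for memcpy):
static uint16_t HalfFloat_sketch(uint16_t value, float scale) {
  float f = (float)value * (scale * 1.9259299444e-34f);  // 2^-112 bias
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  uint32_t h = bits >> 13;
  return (uint16_t)(h > 0xffff ? 0xffff : h);  // vqshrn.u32 saturates
}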
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 bytes
+ "subs %2, %2, #8 \n" // 8 pixels per loop
+ "vmovl.u8 q1, d2 \n" // 8 shorts
+ "vmovl.u16 q2, d2 \n" // 8 ints
+ "vmovl.u16 q3, d3 \n"
+ "vcvt.f32.u32 q2, q2 \n" // 8 floats
+ "vcvt.f32.u32 q3, q3 \n"
+ "vmul.f32 q2, q2, %y3 \n" // scale
+ "vmul.f32 q3, q3, %y3 \n"
+ "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "q1", "q2", "q3");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "vmov.u16 d6, #4 \n" // constant 4
+ "vmov.u16 d7, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
+ "vld1.16 {q2}, [%4]! \n"
+ "vaddl.u16 q0, d2, d4 \n" // * 1
+ "vaddl.u16 q1, d3, d5 \n" // * 1
+ "vld1.16 {q2}, [%1]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "vld1.16 {q2}, [%2]! \n"
+ "vmlal.u16 q0, d4, d7 \n" // * 6
+ "vmlal.u16 q1, d5, d7 \n" // * 6
+ "vld1.16 {q2}, [%3]! \n"
+ "vmlal.u16 q0, d4, d6 \n" // * 4
+ "vmlal.u16 q1, d5, d6 \n" // * 4
+ "subs %6, %6, #8 \n" // 8 processed per loop
+ "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// filter 5 adjacent samples with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "vmov.u32 q10, #4 \n" // constant 4
+ "vmov.u32 q11, #6 \n" // constant 6
+
+ "1: \n"
+ "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
+ "vld1.32 {q2}, [%0] \n"
+ "vadd.u32 q0, q0, q1 \n" // * 1
+ "vadd.u32 q1, q1, q2 \n" // * 1
+ "vld1.32 {q2, q3}, [%2]! \n"
+ "vmla.u32 q0, q2, q11 \n" // * 6
+ "vmla.u32 q1, q3, q11 \n" // * 6
+ "vld1.32 {q2, q3}, [%1]! \n"
+ "vld1.32 {q8, q9}, [%3]! \n"
+ "vadd.u32 q2, q2, q8 \n" // add rows for * 4
+ "vadd.u32 q3, q3, q9 \n"
+ "vmla.u32 q0, q2, q10 \n" // * 4
+ "vmla.u32 q1, q3, q10 \n" // * 4
+ "subs %5, %5, #8 \n" // 8 processed per loop
+ "vqshrn.u32 d0, q0, #8 \n" // round and pack
+ "vqshrn.u32 d1, q1, #8 \n"
+ "vst1.u16 {q0}, [%4]! \n" // store 8 samples
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+}
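// Reference sketch (editor's note, not part of this change): scalar form of
// the horizontal pass above. Each output is the 1, 4, 6, 4, 1 weighted sum
// of five adjacent 32-bit column sums from GaussCol; the #8 shift divides
// out the combined 16 * 16 = 256 weight of the two passes (vqshrn also
// saturates).
static void GaussRow_C_sketch(const uint32_t* src, uint16_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t sum = src[0] + 4 * src[1] + 6 * src[2] + 4 * src[3] + src[4];
    dst[i] = (uint16_t)(sum >> 8);
    ++src;
  }
}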
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q2}, [%0]! \n" // load 16 Y values
+ "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
+ "vmov d1, d0 \n"
+ "vzip.u8 d0, d1 \n" // VV
+ "vmov d3, d2 \n"
+ "vzip.u8 d2, d3 \n" // UU
+ "subs %3, %3, #16 \n" // 16 pixels per loop
+ "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
+ "vst3.8 {d1, d3, d5}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d0, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
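// Reference sketch (editor's note, not part of this change): the
// vpaddl/vpadal/vqrshrun #2 sequence above computes a rounded 2x2 average
// of horizontally paired samples from two rows. Per output byte it is
// equivalent to this hypothetical helper:
static uint8_t Average2x2_sketch(uint8_t tl, uint8_t tr, uint8_t bl,
                                 uint8_t br) {
  return (uint8_t)((tl + tr + bl + br + 2) >> 2);  // +2 = vqrshrun rounding
}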
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ asm volatile(
+ "add %1, %0, %1 \n" // src_stride + src_AYUV
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
+ // pixels.
+ "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
+ // pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV
+ // pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
+ "vqrshrun.s16 d1, q1, #2 \n"
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_stride_ayuv), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+}
+
+// Copy row of AYUV Y's into Y.
+// Similar to ARGBExtractAlphaRow_NEON
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q2}, [%1]! \n" // store 16 Y's.
+ "bgt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
+// Convert biplanar UV channel of NV12 to NV21
+void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
+ "vld2.8 {d1, d3}, [%0]! \n"
+ "vorr.u8 q2, q0, q0 \n" // move U after V
+ "subs %2, %2, #16 \n" // 16 pixels per loop
+ "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
diff --git a/files/source/row_neon64.cc b/files/source/row_neon64.cc
index ebd685e..f5cbb47 100644
--- a/files/source/row_neon64.cc
+++ b/files/source/row_neon64.cc
@@ -19,54 +19,42 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
-#define READYUV422 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- MEMACCESS(1) \
- "ld1 {v1.s}[0], [%1], #4 \n" \
- MEMACCESS(2) \
- "ld1 {v1.s}[1], [%2], #4 \n"
+#define READYUV422 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
+ "ld1 {v1.s}[1], [%2], #4 \n"
// Read 8 Y, 8 U and 8 V from 444
-#define READYUV444 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- MEMACCESS(1) \
- "ld1 {v1.d}[0], [%1], #8 \n" \
- MEMACCESS(2) \
- "ld1 {v1.d}[1], [%2], #8 \n" \
- "uaddlp v1.8h, v1.16b \n" \
- "rshrn v1.8b, v1.8h, #1 \n"
+#define READYUV444 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
- MEMACCESS(0) \
"ld1 {v0.8b}, [%0], #8 \n" \
"movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12
-#define READNV12 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- MEMACCESS(1) \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v1.8b, v2.8b, v2.8b \n" \
- "uzp2 v3.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
+#define READNV12 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21
-#define READNV21 \
- MEMACCESS(0) \
- "ld1 {v0.8b}, [%0], #8 \n" \
- MEMACCESS(1) \
- "ld1 {v2.8b}, [%1], #8 \n" \
- "uzp1 v3.8b, v2.8b, v2.8b \n" \
- "uzp2 v1.8b, v2.8b, v2.8b \n" \
- "ins v1.s[1], v3.s[0] \n"
+#define READNV21 \
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2
#define READYUY2 \
- MEMACCESS(0) \
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
"uzp2 v3.8b, v1.8b, v1.8b \n" \
"uzp1 v1.8b, v1.8b, v1.8b \n" \
@@ -74,7 +62,6 @@
// Read 8 UYVY
#define READUYVY \
- MEMACCESS(0) \
"ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
"orr v0.8b, v3.8b, v3.8b \n" \
"uzp1 v1.8b, v2.8b, v2.8b \n" \
@@ -125,10 +112,10 @@
".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
-void I444ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -138,7 +125,6 @@
READYUV444
YUVTORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
- MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -155,10 +141,10 @@
);
}
-void I422ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -168,7 +154,6 @@
READYUV422
YUVTORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
- MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -185,11 +170,11 @@
);
}
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ const uint8_t* src_a,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -197,10 +182,8 @@
"1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
- MEMACCESS(3)
"ld1 {v23.8b}, [%3], #8 \n"
"subs %w5, %w5, #8 \n"
- MEMACCESS(4)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -218,10 +201,10 @@
);
}
-void I422ToRGBARow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -231,7 +214,6 @@
READYUV422
YUVTORGB(v23, v22, v21)
"subs %w4, %w4, #8 \n"
- MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -248,10 +230,10 @@
);
}
-void I422ToRGB24Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -260,7 +242,6 @@
READYUV422
YUVTORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
- MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -284,34 +265,31 @@
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
-void I422ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- ARGBTORGB565
- MEMACCESS(3)
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb565), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READYUV422 YUVTORGB(
+ v22, v21,
+ v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
#define ARGBTOARGB1555 \
@@ -323,35 +301,32 @@
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
-void I422ToARGB1555Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
- asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
- "1: \n"
- READYUV422
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
- ARGBTOARGB1555
- MEMACCESS(3)
- "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_argb1555), // %3
- "+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
- "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
- );
+ asm volatile(
+ YUVTORGB_SETUP
+ "movi v23.8b, #255 \n"
+ "1: \n" READYUV422 YUVTORGB(
+ v22, v21,
+ v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
+ // ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
#define ARGBTOARGB4444 \
@@ -364,10 +339,10 @@
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
-void I422ToARGB4444Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -379,7 +354,6 @@
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB4444
- MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -396,7 +370,7 @@
);
}
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile (
YUVTORGB_SETUP
"movi v23.8b, #255 \n"
@@ -404,7 +378,6 @@
READYUV400
YUVTORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
- MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -419,29 +392,26 @@
);
}
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
- asm volatile (
- "movi v23.8b, #255 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v20.8b}, [%0], #8 \n"
- "orr v21.8b, v20.8b, v20.8b \n"
- "orr v22.8b, v20.8b, v20.8b \n"
- "subs %w2, %w2, #8 \n"
- MEMACCESS(1)
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v20", "v21", "v22", "v23"
- );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v23.8b, #255 \n"
+ "1: \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
+ "subs %w2, %w2, #8 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v20", "v21", "v22", "v23");
}
-void NV12ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -451,7 +421,6 @@
READNV12
YUVTORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
- MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -467,9 +436,9 @@
);
}
-void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -479,7 +448,6 @@
READNV21
YUVTORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
- MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@@ -495,24 +463,22 @@
);
}
-void NV12ToRGB565Row_NEON(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
- int width) {
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
asm volatile (
YUVTORGB_SETUP
"1: \n"
READNV12
YUVTORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
- ARGBTORGB565
- MEMACCESS(2)
- "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
- "+r"(dst_rgb565), // %2
+ "+r"(dst_rgb24), // %2
"+r"(width) // %3
: [kUVToRB]"r"(&yuvconstants->kUVToRB),
[kUVToG]"r"(&yuvconstants->kUVToG),
@@ -523,8 +489,59 @@
);
}
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
- uint8* dst_argb,
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_rgb24,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile (
+ YUVTORGB_SETUP
+ "1: \n"
+ READNV21
+ YUVTORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_rgb24), // %2
+ "+r"(width) // %3
+ : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+ [kUVToG]"r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_uv,
+ uint8_t* dst_rgb565,
+ const struct YuvConstants* yuvconstants,
+ int width) {
+ asm volatile(
+ YUVTORGB_SETUP
+ "1: \n" READNV12 YUVTORGB(
+ v22, v21,
+ v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
+ // RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+ [kUVToG] "r"(&yuvconstants->kUVToG),
+ [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+ [kYToRgb] "r"(&yuvconstants->kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -534,7 +551,6 @@
READYUY2
YUVTORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
- MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
@@ -549,8 +565,8 @@
);
}
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
- uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
@@ -560,7 +576,6 @@
READUYVY
YUVTORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
- MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
@@ -576,231 +591,250 @@
}
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store U
- MEMACCESS(2)
- "st1 {v1.16b}, [%2], #16 \n" // store V
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store U
+ "st1 {v1.16b}, [%2], #16 \n" // store V
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load U
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load V
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- MEMACCESS(2)
- "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "b.gt 1b \n"
- :
- "+r"(src_u), // %0
- "+r"(src_v), // %1
- "+r"(dst_uv), // %2
- "+r"(width) // %3 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load U
+ "ld1 {v1.16b}, [%1], #16 \n" // load V
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
-// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
- "subs %w2, %w2, #32 \n" // 32 processed per loop
- MEMACCESS(1)
- "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(count) // %2 // Output registers
- : // Input registers
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
- asm volatile (
- "dup v0.16b, %w2 \n" // duplicate 16 bytes
- "1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v8) // %2
- : "cc", "memory", "v0"
- );
-}
-
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
- asm volatile (
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
- : "+r"(dst), // %0
- "+r"(count) // %1
- : "r"(v32) // %2
- : "cc", "memory", "v0"
- );
-}
-
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
- "rev64 v0.16b, v0.16b \n"
- MEMACCESS(1)
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- MEMACCESS(1)
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0"
- );
-}
-
-void MirrorUVRow_NEON(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+ uint8_t* dst_r,
+ uint8_t* dst_g,
+ uint8_t* dst_b,
int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
- "rev64 v0.8b, v0.8b \n"
- "rev64 v1.8b, v1.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- : "r"((ptrdiff_t)-16) // %4
- : "cc", "memory", "v0", "v1"
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st1 {v0.16b}, [%1], #16 \n" // store R
+ "st1 {v1.16b}, [%2], #16 \n" // store G
+ "st1 {v2.16b}, [%3], #16 \n" // store B
+ "b.gt 1b \n"
+ : "+r"(src_rgb), // %0
+ "+r"(dst_r), // %1
+ "+r"(dst_g), // %2
+ "+r"(dst_b), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
- asm volatile (
- // Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
- "sub %0, %0, #16 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- "rev64 v0.4s, v0.4s \n"
- MEMACCESS(1)
- "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
- MEMACCESS(1)
- "st1 {v0.D}[0], [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "r"((ptrdiff_t)-16) // %3
- : "cc", "memory", "v0"
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
+void MergeRGBRow_NEON(const uint8_t* src_r,
+ const uint8_t* src_g,
+ const uint8_t* src_b,
+ uint8_t* dst_rgb,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load R
+ "ld1 {v1.16b}, [%1], #16 \n" // load G
+ "ld1 {v2.16b}, [%2], #16 \n" // load B
+ "subs %w4, %w4, #16 \n" // 16 processed per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
+ "b.gt 1b \n"
+ : "+r"(src_r), // %0
+ "+r"(src_g), // %1
+ "+r"(src_b), // %2
+ "+r"(dst_rgb), // %3
+ "+r"(width) // %4
+ : // Input registers
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
- asm volatile (
- "movi v4.8b, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
+// Copy multiple of 32.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "1: \n"
+ "ldp q0, q1, [%0], #32 \n"
+ "subs %w2, %w2, #32 \n" // 32 processed per loop
+ "stp q0, q1, [%1], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "v0", "v1" // Clobber List
);
}
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
- asm volatile (
- "movi v5.8b, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- MEMACCESS(1)
- "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+ asm volatile(
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v8) // %2
+ : "cc", "memory", "v0");
+}
+
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+ asm volatile(
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(width) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0");
+}
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "rev64 v0.16b, v0.16b \n"
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0");
+}
+
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w3, sxtw #1 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
+ "subs %w3, %w3, #8 \n" // 8 pixels per loop.
+ "rev64 v0.8b, v0.8b \n"
+ "rev64 v1.8b, v1.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ : "r"((ptrdiff_t)-16) // %4
+ : "cc", "memory", "v0", "v1");
+}
+
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ // Start at end of source row.
+ "add %0, %0, %w2, sxtw #2 \n"
+ "sub %0, %0, #16 \n"
+ "1: \n"
+ "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "rev64 v0.4s, v0.4s \n"
+ "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
+ "st1 {v0.D}[0], [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((ptrdiff_t)-16) // %3
+ : "cc", "memory", "v0");
+}
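
The three mirror kernels walk the source backwards (the negative %3/%4 post-increment) and reverse lanes with rev64 plus D-lane-swapped stores. Element-wise they amount to (illustrative sketch, not part of the patch):

#include <stdint.h>
#include <string.h>
// MirrorRow reverses single bytes; ARGBMirrorRow reverses whole 4-byte pixels
// while keeping each pixel's channel order intact.
static void MirrorRow_C_sketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}
static void ARGBMirrorRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    memcpy(dst + 4 * x, src + 4 * (width - 1 - x), 4);
  }
}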
+
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- MEMACCESS(1)
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v5.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+ asm volatile(
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
+ "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
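
As the asm comments note, RAW is R,G,B in memory while RGB24/ARGB are B,G,R(,A), so the RAW variants swap channels and RGB24ToARGB only appends alpha. Scalar sketches (illustrative, not part of the patch):

#include <stdint.h>
static void RGB24ToARGBRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    dst[0] = src[0];  // B
    dst[1] = src[1];  // G
    dst[2] = src[2];  // R
    dst[3] = 255;     // A
    src += 3;
    dst += 4;
  }
}
static void RAWToRGB24Row_C_sketch(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    dst[0] = src[2];  // B <- R,G,B source order reversed
    dst[1] = src[1];  // G
    dst[2] = src[0];  // R
    src += 3;
    dst += 3;
  }
}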
@@ -817,22 +851,22 @@
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- MEMACCESS(1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
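
The RGB565TOARGB macro widens each 5/6-bit field to 8 bits by replicating its high bits into the vacated low bits. Per pixel (illustrative sketch, not part of the patch):

#include <stdint.h>
// 565 -> 888 expansion: 5-bit b becomes (b << 3) | (b >> 2), so 0x1f maps to
// 0xff exactly; the 6-bit g uses (g << 2) | (g >> 4).
static void RGB565ToARGBPixel_C_sketch(uint16_t p, uint8_t argb[4]) {
  uint8_t b = p & 0x1f;
  uint8_t g = (p >> 5) & 0x3f;
  uint8_t r = (p >> 11) & 0x1f;
  argb[0] = (uint8_t)((b << 3) | (b >> 2));  // B
  argb[1] = (uint8_t)((g << 2) | (g >> 4));  // G
  argb[2] = (uint8_t)((r << 3) | (r >> 2));  // R
  argb[3] = 255;                             // A
}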
@@ -873,24 +907,23 @@
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
- uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // Alpha
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- MEMACCESS(1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "movi v3.8b, #255 \n" // Alpha
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
@@ -906,477 +939,429 @@
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
- uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- MEMACCESS(1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
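
ARGB4444 expands by nibble replication instead: c8 = (c4 << 4) | c4, i.e. c4 * 0x11 (illustrative sketch, not part of the patch):

#include <stdint.h>
static void ARGB4444ToARGBPixel_C_sketch(uint16_t p, uint8_t argb[4]) {
  argb[0] = (uint8_t)((p & 0xf) * 0x11);          // B
  argb[1] = (uint8_t)(((p >> 4) & 0xf) * 0x11);   // G
  argb[2] = (uint8_t)(((p >> 8) & 0xf) * 0x11);   // R
  argb[3] = (uint8_t)(((p >> 12) & 0xf) * 0x11);  // A
}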
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- MEMACCESS(1)
- "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
-
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v4.8b, v2.8b, v2.8b \n" // mov g
- "orr v5.8b, v1.8b, v1.8b \n" // mov b
- MEMACCESS(1)
- "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_raw), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
- );
-}
-
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
- "subs %w2, %w2, #16 \n" // 16 processed per loop.
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb24,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
- MEMACCESS(2)
- "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
+ // RGB24.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "subs %w2, %w2, #16 \n" // 16 processed per loop.
+ "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
- "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
- MEMACCESS(2)
- "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
-void YUY2ToUVRow_NEON(const uint8* src_yuy2,
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
+ "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
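
YUY2 stores each 2-pixel group as Y0,U,Y1,V and UYVY as U,Y0,V,Y1; the ld4 de-interleave makes lane selection trivial. Scalar form of the YUY2 variant (illustrative, not part of the patch):

#include <stdint.h>
// One U and one V are emitted per two pixels (4:2:2 subsampling).
static void YUY2ToUV422Row_C_sketch(const uint8_t* src_yuy2, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    *dst_u++ = src_yuy2[1];  // U  (the UYVY variant reads offsets 0 and 2)
    *dst_v++ = src_yuy2[3];  // V
    src_yuy2 += 4;
  }
}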
+
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
- "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
- MEMACCESS(3)
- "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_yuy2), // %0
- "+r"(src_yuy2b), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
- "v5", "v6", "v7" // Clobber List
+ const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+ "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(src_yuy2b), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
);
}
-void UYVYToUVRow_NEON(const uint8* src_uyvy,
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_uyvyb = src_uyvy + stride_uyvy;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
- "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
- "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
- "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
- MEMACCESS(3)
- "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "b.gt 1b \n"
- : "+r"(src_uyvy), // %0
- "+r"(src_uyvyb), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
- "v5", "v6", "v7" // Clobber List
+ const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
+ "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
+ "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+ "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
+ "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
+ "b.gt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(src_uyvyb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v7" // Clobber List
);
}
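
The two-row UV kernels average vertically adjacent chroma with urhadd, a rounding halving add: (a + b + 1) >> 1. Scalar form of the UYVY variant (illustrative, not part of the patch):

#include <stdint.h>
static void UYVYToUVRow_C_sketch(const uint8_t* src_uyvy, int stride_uyvy,
                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
  const uint8_t* next = src_uyvy + stride_uyvy;  // second source row
  for (int x = 0; x < width; x += 2) {
    *dst_u++ = (uint8_t)((src_uyvy[0] + next[0] + 1) >> 1);  // urhadd
    *dst_v++ = (uint8_t)((src_uyvy[2] + next[2] + 1) >> 1);
    src_uyvy += 4;
    next += 4;
  }
}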
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
- asm volatile (
- MEMACCESS(3)
- "ld1 {v2.16b}, [%3] \n" // shuffler
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
- "subs %w2, %w2, #4 \n" // 4 processed per loop
- "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 4.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(shuffler) // %3
- : "cc", "memory", "v0", "v1", "v2" // Clobber List
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // shuffler
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
+ "subs %w2, %w2, #4 \n" // 4 processed per loop
+ "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
+ "st1 {v1.16b}, [%1], #16 \n" // store 4.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
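
tbl is a byte table lookup: output byte i is input byte shuffler[i], which is why one kernel covers all four channel reorders. Assuming the 16-byte table repeats the same 4-byte pattern per pixel, as libyuv's shuffle masks do, the scalar meaning is (illustrative, not part of the patch):

#include <stdint.h>
static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    for (int i = 0; i < 4; ++i) {
      dst_argb[4 * x + i] = src_argb[4 * x + (shuffler[i] & 3)];
    }
  }
}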
-void I422ToYUY2Row_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_yuy2,
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_yuy2,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "orr v2.8b, v1.8b, v1.8b \n"
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
- MEMACCESS(2)
- "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_yuy2), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v2.8b, v1.8b, v1.8b \n"
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
-void I422ToUYVYRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uyvy,
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uyvy,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
- "orr v3.8b, v2.8b, v2.8b \n"
- MEMACCESS(1)
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
- MEMACCESS(2)
- "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
- "subs %w4, %w4, #16 \n" // 16 pixels
- MEMACCESS(3)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_uyvy), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
+ asm volatile(
+ "1: \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
+ "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
+ "subs %w4, %w4, #16 \n" // 16 pixels
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
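
I422ToYUY2/UYVY re-pack planar 4:2:2 into the interleaved formats; two Ys share one U,V pair. Scalar YUY2 packing (illustrative, not part of the patch):

#include <stdint.h>
static void I422ToYUY2Row_C_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                   const uint8_t* src_v, uint8_t* dst_yuy2,
                                   int width) {
  for (int x = 0; x < width; x += 2) {
    dst_yuy2[0] = *src_y++;  // Y0   (UYVY order would be U, Y0, V, Y1)
    dst_yuy2[1] = *src_u++;  // U
    dst_yuy2[2] = *src_y++;  // Y1
    dst_yuy2[3] = *src_v++;  // V
    dst_yuy2 += 4;
  }
}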
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTORGB565
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_rgb565), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
- );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb565,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
}
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
- asm volatile (
- "dup v1.4s, %w2 \n" // dither4
- "1: \n"
- MEMACCESS(1)
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v20.8b, v20.8b, v1.8b \n"
- "uqadd v21.8b, v21.8b, v1.8b \n"
- "uqadd v22.8b, v22.8b, v1.8b \n"
- ARGBTORGB565
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
- "b.gt 1b \n"
- : "+r"(dst_rgb) // %0
- : "r"(src_argb), // %1
- "r"(dither4), // %2
- "r"(width) // %3
- : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
- );
+ asm volatile(
+ "dup v1.4s, %w2 \n" // dither4
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v20.8b, v20.8b, v1.8b \n"
+ "uqadd v21.8b, v21.8b, v1.8b \n"
+ "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
+ "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(dst_rgb) // %0
+ : "r"(src_argb), // %1
+ "r"(dither4), // %2
+ "r"(width) // %3
+ : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
}
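
The dither variant saturate-adds one of the four dither4 bytes to every channel (uqadd) before the usual ARGBTORGB565 truncation; pixel x uses byte x & 3 of the pattern. Scalar sketch (illustrative, not part of the patch):

#include <stdint.h>
static uint8_t sat_add_u8(uint8_t v, uint8_t d) {  // uqadd
  int s = v + d;
  return (uint8_t)(s > 255 ? 255 : s);
}
static void ARGBToRGB565DitherRow_C_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_rgb, uint32_t dither4,
                                           int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t d = (uint8_t)(dither4 >> ((x & 3) * 8));  // repeating pattern
    uint16_t b = sat_add_u8(src_argb[0], d) >> 3;
    uint16_t g = sat_add_u8(src_argb[1], d) >> 2;
    uint16_t r = sat_add_u8(src_argb[2], d) >> 3;
    uint16_t p = (uint16_t)(b | (g << 5) | (r << 11));
    dst_rgb[0] = (uint8_t)p;  // little-endian 16-bit store
    dst_rgb[1] = (uint8_t)(p >> 8);
    src_argb += 4;
    dst_rgb += 2;
  }
}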
-void ARGBToARGB1555Row_NEON(const uint8* src_argb,
- uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb1555,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB1555
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb1555), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
- );
+ asm volatile(
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ // ARGB1555.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
}
-void ARGBToARGB4444Row_NEON(const uint8* src_argb,
- uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb4444,
int width) {
- asm volatile (
- "movi v4.16b, #0x0f \n" // bits to clear with vbic.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGBTOARGB4444
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb4444), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
+ asm volatile(
+ "movi v4.16b, #0x0f \n" // bits to clear with
+ // vbic.
+ "1: \n"
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
+ // ARGB4444.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
+}
+
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
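
The movi constants are BT.601 luma weights scaled by 128 (13/128 = 0.1016, etc.), and sqrshrun #7 is a rounding, saturating narrow, so the loop computes (illustrative sketch, not part of the patch):

#include <stdint.h>
// y = ((13*B + 65*G + 33*R + 64) >> 7) + 16; the maximum sum is 111*255, so
// the result always fits in a byte and the uqadd never actually saturates.
static void ARGBToYRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_y,
                                int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t sum = 13u * src_argb[0] + 65u * src_argb[1] + 33u * src_argb[2];
    dst_y[x] = (uint8_t)(((sum + 64) >> 7) + 16);
    src_argb += 4;
  }
}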
+
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_a,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
+ // pixels
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_a), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
-}
-
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #15 \n" // B * 0.11400 coefficient
- "movi v5.8b, #75 \n" // G * 0.58700 coefficient
- "movi v6.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v5.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v6.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
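
ARGBToYJ is the full-range (JPEG) variant: weights 15/75/38 sum to exactly 128 and there is no +16 offset. Per-pixel sketch (illustrative, not part of the patch):

#include <stdint.h>
// yj = (15*B + 75*G + 38*R + 64) >> 7; the weights sum to 128, so white
// (255,255,255) maps to exactly 255.
static uint8_t ARGBToYJPixel_C_sketch(uint8_t b, uint8_t g, uint8_t r) {
  uint32_t sum = 15u * b + 75u * g + 38u * r;
  return (uint8_t)((sum + 64) >> 7);
}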
// 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb,
- uint8* dst_u,
- uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- asm volatile (
- "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
- "movi v25.8b, #74 \n" // UG -0.5781 coefficient
- "movi v26.8b, #38 \n" // UR -0.2969 coefficient
- "movi v27.8b, #18 \n" // VB -0.1406 coefficient
- "movi v28.8b, #94 \n" // VG -0.7344 coefficient
- "movi v29.16b,#0x80 \n" // 128.5
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlsl v4.8h, v1.8b, v25.8b \n" // G
- "umlsl v4.8h, v2.8b, v26.8b \n" // R
- "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
+ asm volatile(
+ "movi v24.8b, #112 \n" // UB / VR 0.875
+ // coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ // pixels.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
- "umull v3.8h, v2.8b, v24.8b \n" // R
- "umlsl v3.8h, v1.8b, v28.8b \n" // G
- "umlsl v3.8h, v0.8b, v27.8b \n" // B
- "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
- MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
- "v24", "v25", "v26", "v27", "v28", "v29"
- );
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+ "v27", "v28", "v29");
}
#define RGBTOUV_SETUP_REG \
@@ -1388,43 +1373,37 @@
"movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// clang-format off
#define RGBTOUV(QB, QG, QR) \
- "mul v3.8h, " #QB \
- ",v20.8h \n" /* B */ \
- "mul v4.8h, " #QR \
- ",v20.8h \n" /* R */ \
- "mls v3.8h, " #QG \
- ",v21.8h \n" /* G */ \
- "mls v4.8h, " #QG \
- ",v24.8h \n" /* G */ \
- "mls v3.8h, " #QR \
- ",v22.8h \n" /* R */ \
- "mls v4.8h, " #QB \
- ",v23.8h \n" /* B */ \
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
+// clang-format on
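
RGBTOUV runs on 16-bit per-channel averages of 16x2 pixel blocks (built with uaddlp/uadalp and urshr at each call site). Assuming the RGBTOUV_SETUP_REG coefficients match the explicit constants used by RGB565ToUVRow below (56/37/19 for U, 56/47/9 for V, halved because two rows are summed, with 0x8080 as the bias), the per-sample math is roughly (illustrative sketch, not part of the patch; the vector version works in wrapping 16-bit lanes with an unsigned saturating narrow):

#include <stdint.h>
static uint8_t uqshrn8_sketch(int v) {  // uqshrn #8 on the biased sum
  if (v < 0) v = 0;
  v >>= 8;
  return (uint8_t)(v > 255 ? 255 : v);
}
static void RGBToUV_C_sketch(uint16_t b, uint16_t g, uint16_t r, uint8_t* u,
                             uint8_t* v) {
  *u = uqshrn8_sketch(56 * b - 37 * g - 19 * r + 0x8080);
  *v = uqshrn8_sketch(56 * r - 47 * g - 9 * b + 0x8080);
}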
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
-void ARGBToUVRow_NEON(const uint8* src_argb,
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_argb_1 = src_argb + src_stride_argb;
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@@ -1436,9 +1415,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
@@ -1453,12 +1430,12 @@
}
// TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb,
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_argb_1 = src_argb + src_stride_argb;
+ const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
"movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
@@ -1467,12 +1444,10 @@
"movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
- MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@@ -1484,9 +1459,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
@@ -1500,21 +1473,19 @@
);
}
-void BGRAToUVRow_NEON(const uint8* src_bgra,
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+ const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
@@ -1526,9 +1497,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_bgra), // %0
@@ -1542,21 +1511,19 @@
);
}
-void ABGRToUVRow_NEON(const uint8* src_abgr,
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+ const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@@ -1568,9 +1535,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_abgr), // %0
@@ -1584,21 +1549,19 @@
);
}
-void RGBAToUVRow_NEON(const uint8* src_rgba,
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+ const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
@@ -1610,9 +1573,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_rgba), // %0
@@ -1626,21 +1587,19 @@
);
}
-void RGB24ToUVRow_NEON(const uint8* src_rgb24,
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+ const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@@ -1652,9 +1611,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
@@ -1668,21 +1625,19 @@
);
}
-void RAWToUVRow_NEON(const uint8* src_raw,
+void RAWToUVRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_raw_1 = src_raw + src_stride_raw;
+ const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
- MEMACCESS(0)
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
"uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@@ -1694,9 +1649,7 @@
"subs %w4, %w4, #16 \n" // 32 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
- MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_raw), // %0
@@ -1711,717 +1664,656 @@
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565,
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
- asm volatile (
- "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
- "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
- "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
- "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
- "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
- "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+ asm volatile(
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
+ // 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
- RGB565TOARGB
- "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v17.D[0] \n"
- "ins v18.D[1], v19.D[0] \n"
- "ins v20.D[1], v21.D[0] \n"
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v18.8h, #1 \n"
- "urshr v6.8h, v20.8h, #1 \n"
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v16.8h, v4.8h, v22.8h \n" // B
- "mls v16.8h, v5.8h, v23.8h \n" // G
- "mls v16.8h, v6.8h, v24.8h \n" // R
- "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
- "mul v17.8h, v6.8h, v22.8h \n" // R
- "mls v17.8h, v5.8h, v26.8h \n" // G
- "mls v17.8h, v4.8h, v25.8h \n" // B
- "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(src_rgb565_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v25", "v26", "v27"
- );
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_rgb565_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+ "v27");
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
int src_stride_argb1555,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
- RGB555TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(src_argb1555_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
- "v26", "v27", "v28"
- );
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_argb1555_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28");
}
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
int src_stride_argb4444,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
- const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
- asm volatile (
- RGBTOUV_SETUP_REG
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+ asm volatile(
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
- ARGB4444TOARGB
- "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
- "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
- "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "ins v16.D[1], v26.D[0] \n"
- "ins v17.D[1], v27.D[0] \n"
- "ins v18.D[1], v28.D[0] \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "urshr v4.8h, v16.8h, #1 \n" // 2x average
- "urshr v5.8h, v17.8h, #1 \n"
- "urshr v6.8h, v18.8h, #1 \n"
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop.
- "mul v2.8h, v4.8h, v20.8h \n" // B
- "mls v2.8h, v5.8h, v21.8h \n" // G
- "mls v2.8h, v6.8h, v22.8h \n" // R
- "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
- "mul v3.8h, v6.8h, v20.8h \n" // R
- "mls v3.8h, v5.8h, v24.8h \n" // G
- "mls v3.8h, v4.8h, v23.8h \n" // B
- "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
- "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
- MEMACCESS(3)
- "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(src_argb4444_1), // %1
- "+r"(dst_u), // %2
- "+r"(dst_v), // %3
- "+r"(width) // %4
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
- "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
- "v26", "v27", "v28"
+ "subs %w4, %w4, #16 \n" // 16 processed per loop.
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_argb4444_1), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+ "v28"
);
}
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
- asm volatile (
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- RGB565TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb565), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
- "v24", "v25", "v26", "v27"
- );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+ "v27");
}
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB1555TOARGB
- "umull v3.8h, v0.8b, v4.8b \n" // B
- "umlal v3.8h, v1.8b, v5.8b \n" // G
- "umlal v3.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb1555), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
- asm volatile (
- "movi v24.8b, #13 \n" // B * 0.1016 coefficient
- "movi v25.8b, #65 \n" // G * 0.5078 coefficient
- "movi v26.8b, #33 \n" // R * 0.2578 coefficient
- "movi v27.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- ARGB4444TOARGB
- "umull v3.8h, v0.8b, v24.8b \n" // B
- "umlal v3.8h, v1.8b, v25.8b \n" // G
- "umlal v3.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v27.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_argb4444), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
- );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+ uint8_t* dst_y,
+ int width) {
+ asm volatile(
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // R
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // R
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // B
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v1.8b, v4.8b \n" // B
- "umlal v16.8h, v2.8b, v5.8b \n" // G
- "umlal v16.8h, v3.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #13 \n" // B * 0.1016 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #33 \n" // R * 0.2578 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
- asm volatile (
- "movi v4.8b, #33 \n" // R * 0.2578 coefficient
- "movi v5.8b, #65 \n" // G * 0.5078 coefficient
- "movi v6.8b, #13 \n" // B * 0.1016 coefficient
- "movi v7.8b, #16 \n" // Add 16 constant
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v16.8h, v0.8b, v4.8b \n" // B
- "umlal v16.8h, v1.8b, v5.8b \n" // G
- "umlal v16.8h, v2.8b, v6.8b \n" // R
- "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
- "uqadd v0.8b, v0.8b, v7.8b \n"
- MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+ asm volatile(
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
+ "1: \n"
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
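
// A C sketch of the shared fixed-point luma math used by the *ToYRow_NEON
// kernels above (illustrative helper, not part of this change; assumes
// <stdint.h>): 7-bit coefficients 13/65/33 with rounding (sqrshrun #7),
// then a saturating +16 (uqadd) for the studio-range offset.
static uint8_t RGBToYSketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // rounded narrowing shift
  y += 16;                                       // add 16, saturating in NEON
  return (uint8_t)(y > 255 ? 255 : y);
}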
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
- const uint8* src_ptr1 = src_ptr + src_stride;
- asm volatile (
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
- "dup v5.16b, %w4 \n"
- "dup v4.16b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v2.8h, v0.8b, v4.8b \n"
- "umull2 v3.8h, v0.16b, v4.16b \n"
- "umlal v2.8h, v1.8b, v5.8b \n"
- "umlal2 v3.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v2.8h, #8 \n"
- "rshrn2 v0.16b, v3.8h, #8 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(dst_width), // %3
- "+r"(y1_fraction), // %4
- "+r"(y0_fraction) // %5
- :
- : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
- );
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(dst_width), // %3
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
+ :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
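
// What the general-purpose path above computes per byte, as a C sketch
// (illustrative only, assumes <stdint.h>): a rounded 8.8 fixed-point blend
// of the two rows, matching umull/umlal followed by rshrn #8.
static uint8_t InterpolateSketch(uint8_t s0, uint8_t s1, int y1_fraction) {
  int y0_fraction = 256 - y1_fraction;
  return (uint8_t)((s0 * y0_fraction + s1 * y1_fraction + 128) >> 8);
}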
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "subs %w3, %w3, #8 \n"
- "b.lt 89f \n"
- // Blend 8 pixels.
- "8: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.ge 8b \n"
+ asm volatile(
+ "subs %w3, %w3, #8 \n"
+ "b.lt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
+ // pixels
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
+ // pixels
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ // pixels
+ "b.ge 8b \n"
- "89: \n"
- "adds %w3, %w3, #8-1 \n"
- "b.lt 99f \n"
+ "89: \n"
+ "adds %w3, %w3, #8-1 \n"
+ "b.lt 99f \n"
- // Blend 1 pixels.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
- MEMACCESS(1)
- "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
- "subs %w3, %w3, #1 \n" // 1 processed per loop.
- "umull v16.8h, v4.8b, v3.8b \n" // db * a
- "umull v17.8h, v5.8b, v3.8b \n" // dg * a
- "umull v18.8h, v6.8b, v3.8b \n" // dr * a
- "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
- "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
- "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
- "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
- "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
- "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
- "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
- "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
- "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
- "movi v3.8b, #255 \n" // a = 255
- MEMACCESS(2)
- "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
- "b.ge 1b \n"
+ // Blend 1 pixels.
+ "1: \n"
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
+ "subs %w3, %w3, #1 \n" // 1 processed per loop.
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
- "99: \n"
+ "99: \n"
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v16", "v17", "v18"
- );
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18");
}
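
// Per-channel C sketch of the blend above (illustrative, assumes
// <stdint.h>), following the identity in the comment before the kernel:
// dst = src + dst - dst * sa / 256, with saturating subtract/add and the
// destination alpha forced to 255.
static uint8_t BlendChannelSketch(uint8_t s, uint8_t d, uint8_t sa) {
  int da = (d * sa + 128) >> 8;   // uqrshrn #8: rounded dst * alpha / 256
  int out = d > da ? d - da : 0;  // uqsub clamps at zero
  out += s;                       // uqadd clamps at 255
  return (uint8_t)(out > 255 ? 255 : out);
}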
// Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- // Attenuate 8 pixels.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v3.8b \n" // b * a
- "umull v5.8h, v1.8b, v3.8b \n" // g * a
- "umull v6.8h, v2.8b, v3.8b \n" // r * a
- "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
- "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
- "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
- MEMACCESS(1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ int width) {
+ asm volatile(
+ // Attenuate 8 pixels.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
+ // pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
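
// One-channel C sketch of the premultiply above (illustrative, assumes
// <stdint.h>): c' = (c * a + 128) >> 8, i.e. umull followed by the rounding
// narrowing shift uqrshrn #8.
static uint8_t AttenuateSketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 128) >> 8);
}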
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb,
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
int width) {
- asm volatile (
- "dup v4.8h, %w2 \n"
- "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
- "dup v5.8h, %w3 \n" // interval multiply.
- "dup v6.8h, %w4 \n" // interval add
+ asm volatile(
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
- "uxtl v1.8h, v1.8b \n"
- "uxtl v2.8h, v2.8b \n"
- "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
- "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
- "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
- "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
- "mul v1.8h, v1.8h, v5.8h \n" // g
- "mul v2.8h, v2.8h, v5.8h \n" // r
- "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
- "add v1.8h, v1.8h, v6.8h \n" // g
- "add v2.8h, v2.8h, v6.8h \n" // r
- "uqxtn v0.8b, v0.8h \n"
- "uqxtn v1.8b, v1.8h \n"
- "uqxtn v2.8b, v2.8h \n"
- MEMACCESS(0)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(scale), // %2
- "r"(interval_size), // %3
- "r"(interval_offset) // %4
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
- );
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
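
// The quantize equation from the comment above in plain C (illustrative,
// assumes <stdint.h>). The NEON path pre-halves scale and uses a doubling
// multiply (sqdmulh), so its product matches (v * scale) >> 16.
static uint8_t QuantizeSketch(uint8_t v, int scale, int interval_size,
                              int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return (uint8_t)(q > 255 ? 255 : q);  // uqxtn saturates to 8 bits
}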
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
- asm volatile (
- "dup v0.4s, %w3 \n" // duplicate scale value.
- "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
- "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
+ uint32_t value) {
+ asm volatile(
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
- "uxtl v5.8h, v5.8b \n"
- "uxtl v6.8h, v6.8b \n"
- "uxtl v7.8h, v7.8b \n"
- "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
- "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
- "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
- "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
- "uqxtn v4.8b, v4.8h \n"
- "uqxtn v5.8b, v5.8h \n"
- "uqxtn v6.8b, v6.8h \n"
- "uqxtn v7.8b, v7.8h \n"
- MEMACCESS(1)
- "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(value) // %3
- : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
- );
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
}
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
- asm volatile (
- "movi v24.8b, #15 \n" // B * 0.11400 coefficient
- "movi v25.8b, #75 \n" // G * 0.58700 coefficient
- "movi v26.8b, #38 \n" // R * 0.29900 coefficient
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v24.8b \n" // B
- "umlal v4.8h, v1.8b, v25.8b \n" // G
- "umlal v4.8h, v2.8b, v26.8b \n" // R
- "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
- "orr v1.8b, v0.8b, v0.8b \n" // G
- "orr v2.8b, v0.8b, v0.8b \n" // R
- MEMACCESS(1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
- );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
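
// The C equation quoted above applied to one pixel (illustrative helper,
// assumes <stdint.h>): the rounded 7-bit luma replaces all three color
// channels; alpha passes through unchanged.
static void GraySketch(uint8_t* argb) {  // argb[0..3] = b, g, r, a
  int y = (15 * argb[0] + 75 * argb[1] + 38 * argb[2] + 64) >> 7;
  argb[0] = argb[1] = argb[2] = (uint8_t)y;
}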
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
@@ -2429,202 +2321,180 @@
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
- asm volatile (
- "movi v20.8b, #17 \n" // BB coefficient
- "movi v21.8b, #68 \n" // BG coefficient
- "movi v22.8b, #35 \n" // BR coefficient
- "movi v24.8b, #22 \n" // GB coefficient
- "movi v25.8b, #88 \n" // GG coefficient
- "movi v26.8b, #45 \n" // GR coefficient
- "movi v28.8b, #24 \n" // BB coefficient
- "movi v29.8b, #98 \n" // BG coefficient
- "movi v30.8b, #50 \n" // BR coefficient
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
- "subs %w1, %w1, #8 \n" // 8 processed per loop.
- "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
- "umlal v4.8h, v1.8b, v21.8b \n" // G
- "umlal v4.8h, v2.8b, v22.8b \n" // R
- "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
- "umlal v5.8h, v1.8b, v25.8b \n" // G
- "umlal v5.8h, v2.8b, v26.8b \n" // R
- "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
- "umlal v6.8h, v1.8b, v29.8b \n" // G
- "umlal v6.8h, v2.8b, v30.8b \n" // R
- "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
- "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
- "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
- "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
- );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+ asm volatile(
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+ "movi v28.8b, #24 \n" // RB coefficient
+ "movi v29.8b, #98 \n" // RG coefficient
+ "movi v30.8b, #50 \n" // RR coefficient
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
+ "subs %w1, %w1, #8 \n" // 8 processed per loop.
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}
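
// One-pixel C sketch of the sepia transform above, using the coefficients
// loaded into v20..v30 (illustrative, assumes <stdint.h>; uqshrn #7
// truncates and saturates to 8 bits).
static void SepiaSketch(uint8_t* argb) {  // argb[0..3] = b, g, r, a
  int b = argb[0], g = argb[1], r = argb[2];
  int nb = (b * 17 + g * 68 + r * 35) >> 7;
  int ng = (b * 22 + g * 88 + r * 45) >> 7;
  int nr = (b * 24 + g * 98 + r * 50) >> 7;
  argb[0] = (uint8_t)(nb > 255 ? 255 : nb);
  argb[1] = (uint8_t)(ng > 255 ? 255 : ng);
  argb[2] = (uint8_t)(nr > 255 ? 255 : nr);
}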
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
- asm volatile (
- MEMACCESS(3)
- "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
- "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
- "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
+ asm volatile(
+ "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
- "uxtl v17.8h, v17.8b \n" // g
- "uxtl v18.8h, v18.8b \n" // r
- "uxtl v19.8h, v19.8b \n" // a
- "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
- "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
- "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
- "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
- "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
- "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
- "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
- "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
- "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
- "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
- "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
- "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
- "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
- "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
- "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
- "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
- "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
- "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
- "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
- "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
- "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
- "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(width) // %2
- : "r"(matrix_argb) // %3
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v22", "v23", "v24", "v25"
- );
+ "1: \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
+ "subs %w2, %w2, #8 \n" // 8 processed per loop.
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v22", "v23", "v24", "v25");
}
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "umull v0.8h, v0.8b, v4.8b \n" // multiply B
- "umull v1.8h, v1.8b, v5.8b \n" // multiply G
- "umull v2.8h, v2.8b, v6.8b \n" // multiply R
- "umull v3.8h, v3.8b, v7.8b \n" // multiply A
- "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
- "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
- "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
- "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "umull v0.8h, v0.8b, v4.8b \n" // multiply B
+ "umull v1.8h, v1.8b, v5.8b \n" // multiply G
+ "umull v2.8h, v2.8b, v6.8b \n" // multiply R
+ "umull v3.8h, v3.8b, v7.8b \n" // multiply A
+ "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
+ "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
+ "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
+ "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v4.8b \n"
- "uqadd v1.8b, v1.8b, v5.8b \n"
- "uqadd v2.8b, v2.8b, v6.8b \n"
- "uqadd v3.8b, v3.8b, v7.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v4.8b \n"
+ "uqadd v1.8b, v1.8b, v5.8b \n"
+ "uqadd v2.8b, v2.8b, v6.8b \n"
+ "uqadd v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
- MEMACCESS(1)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqsub v0.8b, v0.8b, v4.8b \n"
- "uqsub v1.8b, v1.8b, v5.8b \n"
- "uqsub v2.8b, v2.8b, v6.8b \n"
- "uqsub v3.8b, v3.8b, v7.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
-
- : "+r"(src_argb0), // %0
- "+r"(src_argb1), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
- );
+ asm volatile(
+ // 8 pixel loop.
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqsub v0.8b, v0.8b, v4.8b \n"
+ "uqsub v1.8b, v1.8b, v5.8b \n"
+ "uqsub v2.8b, v2.8b, v6.8b \n"
+ "uqsub v3.8b, v3.8b, v7.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2632,58 +2502,50 @@
// R = Sobel
// G = Sobel
// B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v0.8b, v0.8b, v1.8b \n" // add
- "orr v1.8b, v0.8b, v0.8b \n"
- "orr v2.8b, v0.8b, v0.8b \n"
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v0.8b, v0.8b, v1.8b \n" // add
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
// Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
- asm volatile (
- // 16 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
- "subs %w3, %w3, #16 \n" // 16 processed per loop.
- "uqadd v0.16b, v0.16b, v1.16b \n" // add
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_y), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1"
- );
+ asm volatile(
+ // 16 pixel loop.
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
+ "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "uqadd v0.16b, v0.16b, v1.16b \n" // add
+ "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1");
}
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2691,75 +2553,64 @@
// R = Sobel X
// G = Sobel
// B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
- asm volatile (
- "movi v3.8b, #255 \n" // alpha
- // 8 pixel loop.
- "1: \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
- MEMACCESS(1)
- "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uqadd v1.8b, v0.8b, v2.8b \n" // add
- MEMACCESS(2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
- "b.gt 1b \n"
- : "+r"(src_sobelx), // %0
- "+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
- :
- : "cc", "memory", "v0", "v1", "v2", "v3"
- );
+ asm volatile(
+ "movi v3.8b, #255 \n" // alpha
+ // 8 pixel loop.
+ "1: \n"
+ "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
+ "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uqadd v1.8b, v0.8b, v2.8b \n" // add
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
+ "b.gt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
}
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
-void SobelXRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+void SobelXRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0],%5 \n" // top
- MEMACCESS(0)
- "ld1 {v1.8b}, [%0],%6 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- MEMACCESS(1)
- "ld1 {v2.8b}, [%1],%5 \n" // center * 2
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%6 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- MEMACCESS(2)
- "ld1 {v2.8b}, [%2],%5 \n" // bottom
- MEMACCESS(2)
- "ld1 {v3.8b}, [%2],%6 \n"
- "subs %w4, %w4, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- MEMACCESS(3)
- "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(src_y2), // %2
- "+r"(dst_sobelx), // %3
- "+r"(width) // %4
- : "r"(2LL), // %5
- "r"(6LL) // %6
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%5 \n" // top
+ "ld1 {v1.8b}, [%0],%6 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%1],%5 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%6 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%2],%5 \n" // bottom
+ "ld1 {v3.8b}, [%2],%6 \n"
+ "subs %w4, %w4, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2LL), // %5
+ "r"(6LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
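
// Per-pixel C sketch of SobelX (illustrative, assumes <stdint.h>): column
// differences weighted 1/2/1 across three rows, then absolute value and an
// unsigned clamp, matching the usubl/add/abs/uqxtn sequence above. The
// differences are sign-flipped relative to the matrix; abs makes them equal.
static uint8_t SobelXSketch(const uint8_t* y0, const uint8_t* y1,
                            const uint8_t* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}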
@@ -2767,93 +2618,414 @@
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_NEON(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+void SobelYRow_NEON(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0],%4 \n" // left
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1],%4 \n"
- "usubl v0.8h, v0.8b, v1.8b \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0],%4 \n" // center * 2
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%4 \n"
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "add v0.8h, v0.8h, v1.8h \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0],%5 \n" // right
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1],%5 \n"
- "subs %w3, %w3, #8 \n" // 8 pixels
- "usubl v1.8h, v2.8b, v3.8b \n"
- "add v0.8h, v0.8h, v1.8h \n"
- "abs v0.8h, v0.8h \n"
- "uqxtn v0.8b, v0.8h \n"
- MEMACCESS(2)
- "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "b.gt 1b \n"
- : "+r"(src_y0), // %0
- "+r"(src_y1), // %1
- "+r"(dst_sobely), // %2
- "+r"(width) // %3
- : "r"(1LL), // %4
- "r"(6LL) // %5
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.8b}, [%0],%4 \n" // left
+ "ld1 {v1.8b}, [%1],%4 \n"
+ "usubl v0.8h, v0.8b, v1.8b \n"
+ "ld1 {v2.8b}, [%0],%4 \n" // center * 2
+ "ld1 {v3.8b}, [%1],%4 \n"
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "ld1 {v2.8b}, [%0],%5 \n" // right
+ "ld1 {v3.8b}, [%1],%5 \n"
+ "subs %w3, %w3, #8 \n" // 8 pixels
+ "usubl v1.8h, v2.8b, v3.8b \n"
+ "add v0.8h, v0.8h, v1.8h \n"
+ "abs v0.8h, v0.8h \n"
+ "uqxtn v0.8b, v0.8h \n"
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
+ "b.gt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1LL), // %4
+ "r"(6LL) // %5
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Caveat - rounds float to half float whereas scaling version truncates.
-void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fcvtn v1.4h, v2.4s \n" // 8 half floats
- "fcvtn2 v1.8h, v3.4s \n"
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v1", "v2", "v3"
- );
+void HalfFloat1Row_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float /*unused*/,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fcvtn v1.4h, v2.4s \n" // 8 half floats
+ "fcvtn2 v1.8h, v3.4s \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v1", "v2", "v3");
}
-void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
- "subs %w2, %w2, #8 \n" // 8 pixels per loop
- "uxtl v2.4s, v1.4h \n" // 8 int's
- "uxtl2 v3.4s, v1.8h \n"
- "scvtf v2.4s, v2.4s \n" // 8 floats
- "scvtf v3.4s, v3.4s \n"
- "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
- "fmul v3.4s, v3.4s, %3.s[0] \n"
- "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
- "uqshrn2 v1.8h, v3.4s, #13 \n"
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
- "b.gt 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- : "w"(scale * 1.9259299444e-34f) // %3
- : "cc", "memory", "v1", "v2", "v3"
- );
+void HalfFloatRow_NEON(const uint16_t* src,
+ uint16_t* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v2.4s, v1.4h \n" // 8 int's
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
+ "uqshrn2 v1.8h, v3.4s, #13 \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale * 1.9259299444e-34f) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
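
// Why the constant works (sketch, not part of this change): 1.9259299444e-34f
// is 2^-112, so scale * 2^-112 rebiases the float exponent (bias 127) down to
// the half-float bias (15); uqshrn #13 then slides the exponent and the top
// 10 mantissa bits into half-float positions, truncating the rest (hence the
// "truncates" caveat above). Assumes <stdint.h> and <string.h> for memcpy.
static uint16_t HalfFloatSketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));             // reinterpret float as bits
  uint32_t h = bits >> 13;                     // align exponent and mantissa
  return (uint16_t)(h > 0xFFFF ? 0xFFFF : h);  // uqshrn saturation
}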
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+ float* dst,
+ float scale,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
+ "subs %w2, %w2, #8 \n" // 8 pixels per loop
+ "uxtl v1.8h, v1.8b \n" // 8 shorts
+ "uxtl v2.4s, v1.4h \n" // 8 ints
+ "uxtl2 v3.4s, v1.8h \n"
+ "scvtf v2.4s, v2.4s \n" // 8 floats
+ "scvtf v3.4s, v3.4s \n"
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "fmul v3.4s, v3.4s, %3.s[0] \n"
+ "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fmax;
+ asm volatile(
+ "movi v5.4s, #0 \n" // max
+ "movi v6.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
+ "fmax v5.4s, v5.4s, v1.4s \n" // max
+ "fmax v6.4s, v6.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "fmax v5.4s, v5.4s, v6.4s \n" // max
+ "fmaxv %s3, v5.4s \n" // signed max accumulator
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fmax) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+ float* dst,
+ float scale,
+ int width) {
+ float fsum;
+ asm volatile(
+ "movi v5.4s, #0 \n" // sum of squares
+ "movi v6.4s, #0 \n" // sum of squares
+
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
+ "fmul v4.4s, v2.4s, %4.s[0] \n"
+ "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
+ "fmla v6.4s, v2.4s, v2.4s \n"
+ "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ "faddp v5.4s, v5.4s, v6.4s \n"
+ "faddp v5.4s, v5.4s, v5.4s \n"
+ "faddp %3.4s, v5.4s, v5.4s \n" // sum
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width), // %2
+ "=w"(fsum) // %3
+ : "w"(scale) // %4
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+ return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
+ "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
+ "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "w"(scale) // %3
+ : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width) {
+ asm volatile(
+ "movi v6.8h, #4 \n" // constant 4
+ "movi v7.8h, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
+ "ld1 {v2.8h}, [%4], #16 \n"
+ "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
+ "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
+ "ld1 {v2.8h}, [%1], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "ld1 {v2.8h}, [%2], #16 \n"
+ "umlal v0.4s, v2.4h, v7.4h \n" // * 6
+ "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
+ "ld1 {v2.8h}, [%3], #16 \n"
+ "umlal v0.4s, v2.4h, v6.4h \n" // * 4
+ "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
+ "subs %w6, %w6, #8 \n" // 8 processed per loop
+ "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src0), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(src4), // %4
+ "+r"(dst), // %5
+ "+r"(width) // %6
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter one row of 32-bit sums horizontally with 1, 4, 6, 4, 1 coefficients.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+ const uint32_t* src1 = src + 1;
+ const uint32_t* src2 = src + 2;
+ const uint32_t* src3 = src + 3;
+ asm volatile(
+ "movi v6.4s, #4 \n" // constant 4
+ "movi v7.4s, #6 \n" // constant 6
+
+ "1: \n"
+ "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
+ "add v0.4s, v0.4s, v1.4s \n" // * 1
+ "add v1.4s, v1.4s, v2.4s \n" // * 1
+ "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+ "mla v0.4s, v2.4s, v7.4s \n" // * 6
+ "mla v1.4s, v3.4s, v7.4s \n" // * 6
+ "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+ "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+ "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
+ "add v3.4s, v3.4s, v5.4s \n"
+ "mla v0.4s, v2.4s, v6.4s \n" // * 4
+ "mla v1.4s, v3.4s, v6.4s \n" // * 4
+ "subs %w5, %w5, #8 \n" // 8 processed per loop
+ "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
+ "uqrshrn2 v0.8h, v1.4s, #8 \n"
+ "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
+ "b.gt 1b \n"
+ : "+r"(src), // %0
+ "+r"(src1), // %1
+ "+r"(src2), // %2
+ "+r"(src3), // %3
+ "+r"(dst), // %4
+ "+r"(width) // %5
+ : "r"(32LL) // %6
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
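
// C sketch of one output sample from the horizontal pass above
// (illustrative, assumes <stdint.h>): the same 1, 4, 6, 4, 1 kernel across
// x, renormalized by the combined two-pass weight 16 * 16 = 256 with
// rounding (uqrshrn #8, which also saturates to 16 bits).
static uint16_t GaussRowSketch(const uint32_t* s, int i) {
  uint32_t v = s[i] + 4 * s[i + 1] + 6 * s[i + 2] + 4 * s[i + 3] + s[i + 4];
  return (uint16_t)((v + 128) >> 8);
}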
+
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+ const uint8_t* src_vu,
+ uint8_t* dst_yuv24,
+ int width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
+ "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
+ "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
+ "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
+ "subs %w3, %w3, #16 \n" // 16 pixels per loop
+ "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_vu), // %1
+ "+r"(dst_yuv24), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2");
+}
+
+void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_uv,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v2.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
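
// The 2x2 chroma average above, for one output byte (illustrative, assumes
// <stdint.h>): uaddlp + uadalp sum four neighbors, uqrshrn #2 rounds and
// narrows back to 8 bits.
static uint8_t Average4Sketch(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}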
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_vu,
+ int width) {
+ const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
+ asm volatile(
+
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ // pixels.
+ "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
+ "uqrshrn v1.8b, v1.8h, #2 \n"
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(src_ayuv_1), // %1
+ "+r"(dst_vu), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
+ // pixels
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
+ "b.gt 1b \n"
+ : "+r"(src_ayuv), // %0
+ "+r"(dst_y), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3");
+}
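+
+// Scalar sketch of AYUVToYRow_NEON (illustrative helper name, not upstream;
+// Y is the third byte of each 4-byte AYUV pixel, matching the st1 of v2
+// above):
+static void AYUVToYSketch_C(const uint8_t* src_ayuv, uint8_t* dst_y,
+                            int width) {
+  for (int i = 0; i < width; ++i) {
+    dst_y[i] = src_ayuv[i * 4 + 2];
+  }
+}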
+
+void FloatDivToByteRow_NEON(const float* src_weights,
+ const float* src_values,
+ uint8_t* dst_out,
+ uint8_t* dst_mask,
+ int width) {
+ asm volatile(
+ "movi v0.4s, #0 \n"
+
+ "1: \n"
+ "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
+ "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
+ "subs %w4, %w4, #8 \n" // 8 pixels per loop
+
+ "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
+ "fdiv v2.4s, v4.4s, v2.4s \n"
+
+ "fcvtas v1.4s, v1.4s \n" // float to int
+ "fcvtas v2.4s, v2.4s \n" // float to int
+ "uqxtn v1.4h, v1.4s \n" // 8 shorts
+ "uqxtn2 v1.8h, v2.4s \n"
+ "uqxtn v1.8b, v1.8h \n" // 8 bytes
+
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
+
+ "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
+ "fcmgt v6.4s, v2.4s, v0.4s \n"
+ "uqxtn v5.4h, v5.4s \n" // 8 shorts
+ "uqxtn2 v5.8h, v6.4s \n"
+ "uqxtn v5.8b, v1.8h \n" // 8 bytes
+
+ "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
+
+ "b.gt 1b \n"
+ : "+r"(src_weights), // %0
+ "+r"(src_values), // %1
+ "+r"(dst_out), // %2
+ "+r"(dst_mask), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+}
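+
+// Scalar sketch of the intended behavior of FloatDivToByteRow_NEON
+// (illustrative helper name, not upstream): dst_out is the rounded,
+// 0..255-saturated quotient; dst_mask is 255 where the quotient is
+// positive, else 0.
+static void FloatDivToByteSketch_C(const float* src_weights,
+                                   const float* src_values, uint8_t* dst_out,
+                                   uint8_t* dst_mask, int width) {
+  for (int i = 0; i < width; ++i) {
+    float q = src_values[i] / src_weights[i];
+    int v = (int)(q + 0.5f);  // round to nearest, as fcvtas does
+    dst_out[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
+    dst_mask[i] = (uint8_t)(q > 0.0f ? 255 : 0);
+  }
+}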
+
+// Convert biplanar UV channel of NV12 to NV21
+void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
+ asm volatile(
+ "1: \n"
+ "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
+ "orr v2.16b, v0.16b, v0.16b \n" // move U after V
+ "subs %w2, %w2, #16 \n" // 16 pixels per loop
+ "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
+ "b.gt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_vu), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "v0", "v1", "v2");
}
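
// Scalar sketch of UVToVURow_NEON above (illustrative helper name, not an
// upstream symbol):
static void UVToVUSketch_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; ++i) {
    dst_vu[i * 2 + 0] = src_uv[i * 2 + 1];  // V
    dst_vu[i * 2 + 1] = src_uv[i * 2 + 0];  // U
  }
}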
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
diff --git a/files/source/row_win.cc b/files/source/row_win.cc
index 202f2b8..27e3da7 100644
--- a/files/source/row_win.cc
+++ b/files/source/row_win.cc
@@ -28,27 +28,27 @@
#if defined(_M_X64)
// Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+#define READYUV422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
y_buf += 8;
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
-#define READYUVA422 \
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
- u_buf += 4; \
- xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
- xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
- y_buf += 8; \
- xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
+#define READYUVA422 \
+ xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \
+ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
+ u_buf += 4; \
+ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
+ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
+ y_buf += 8; \
+ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
a_buf += 8;
// Convert 8 pixels: 8 UV and 8 Y.
@@ -84,15 +84,15 @@
dst_argb += 32;
#if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__m128i xmm0, xmm1, xmm2, xmm4;
const __m128i xmm5 = _mm_set1_epi8(-1);
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUV422
YUVTORGB(yuvconstants)
@@ -103,15 +103,15 @@
#endif
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__m128i xmm0, xmm1, xmm2, xmm4, xmm5;
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+ const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
while (width > 0) {
READYUVA422
YUVTORGB(yuvconstants)
@@ -255,8 +255,8 @@
};
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
- uint8* dst_argb,
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_y
@@ -285,8 +285,8 @@
#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
- uint8* dst_argb,
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_y
@@ -316,8 +316,8 @@
}
#endif // HAS_J400TOARGBROW_AVX2
-__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
- uint8* dst_argb,
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_rgb24
@@ -355,8 +355,8 @@
}
}
-__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
- uint8* dst_argb,
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_raw
@@ -394,8 +394,8 @@
}
}
-__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
- uint8* dst_rgb24,
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+ uint8_t* dst_rgb24,
int width) {
__asm {
mov eax, [esp + 4] // src_raw
@@ -430,8 +430,8 @@
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
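// Worked check of the bit-repeat trick: for a 5-bit v, v * 0x108 =
// (v << 8) + (v << 3), whose top 8 of 13 bits equal (v << 3) | (v >> 2);
// e.g. v = 31: 31 * 264 = 8184 = 0x1FF8, and 0x1FF8 >> 5 = 255, so a
// full-scale 5-bit value maps to a full-scale 8-bit value.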
-__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
- uint8* dst_argb,
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -486,8 +486,8 @@
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
- uint8* dst_argb,
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -537,8 +537,8 @@
#endif // HAS_RGB565TOARGBROW_AVX2
#ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
- uint8* dst_argb,
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -589,8 +589,8 @@
#endif // HAS_ARGB1555TOARGBROW_AVX2
#ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
- uint8* dst_argb,
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
@@ -627,8 +627,8 @@
#endif // HAS_ARGB4444TOARGBROW_AVX2
// 24 instructions
-__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
- uint8* dst_argb,
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x01080108 // generate multiplier to repeat 5 bits
@@ -680,8 +680,8 @@
}
// 18 instructions.
-__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
- uint8* dst_argb,
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
@@ -718,8 +718,8 @@
}
}
-__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -757,8 +757,8 @@
}
}
-__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -796,8 +796,8 @@
}
}
-__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -834,9 +834,9 @@
}
}
-__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
__asm {
@@ -881,9 +881,9 @@
}
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
- const uint32 dither4,
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
+ const uint32_t dither4,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -925,8 +925,8 @@
#endif // HAS_ARGBTORGB565DITHERROW_AVX2
// TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -967,8 +967,8 @@
}
}
-__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -998,8 +998,8 @@
}
#ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1036,8 +1036,8 @@
#endif // HAS_ARGBTORGB565ROW_AVX2
#ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1077,8 +1077,8 @@
#endif // HAS_ARGBTOARGB1555ROW_AVX2
#ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
- uint8* dst_rgb,
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_rgb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1109,8 +1109,8 @@
#endif // HAS_ARGBTOARGB4444ROW_AVX2
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1145,8 +1145,8 @@
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1185,8 +1185,8 @@
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1225,8 +1225,8 @@
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1265,8 +1265,8 @@
}
#endif // HAS_ARGBTOYJROW_AVX2
-__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1299,8 +1299,8 @@
}
}
-__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1333,8 +1333,8 @@
}
}
-__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
- uint8* dst_y,
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -1367,10 +1367,10 @@
}
}
-__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1410,9 +1410,9 @@
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1426,7 +1426,7 @@
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1439,10 +1439,10 @@
}
}
-__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1482,9 +1482,9 @@
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1499,7 +1499,7 @@
psraw xmm1, 8
packsswb xmm0, xmm1
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1513,10 +1513,10 @@
}
#ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1549,9 +1549,9 @@
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@@ -1565,7 +1565,7 @@
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned
- // step 3 - store 16 U and 16 V values
+ // step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@@ -1581,10 +1581,10 @@
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1617,9 +1617,9 @@
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 32 different pixels, its 16 pixels of U and 16 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@@ -1634,7 +1634,7 @@
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
- // step 3 - store 16 U and 16 V values
+ // step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@@ -1649,9 +1649,9 @@
}
#endif // HAS_ARGBTOUVJROW_AVX2
-__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -1707,10 +1707,10 @@
}
}
-__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1750,9 +1750,9 @@
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1766,7 +1766,7 @@
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1779,10 +1779,10 @@
}
}
-__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1822,9 +1822,9 @@
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1838,7 +1838,7 @@
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -1851,10 +1851,10 @@
}
}
-__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
int src_stride_argb,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -1894,9 +1894,9 @@
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@@ -1910,7 +1910,7 @@
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
- // step 3 - store 8 U and 8 V values
+ // step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@@ -2065,10 +2065,10 @@
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I422ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2105,11 +2105,11 @@
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2148,10 +2148,10 @@
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I444ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2187,9 +2187,9 @@
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV12ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2222,9 +2222,9 @@
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV21ToARGBRow_AVX2(
- const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2257,8 +2257,8 @@
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void YUY2ToARGBRow_AVX2(
- const uint8* src_yuy2,
- uint8* dst_argb,
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2288,8 +2288,8 @@
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void UYVYToARGBRow_AVX2(
- const uint8* src_uyvy,
- uint8* dst_argb,
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2319,10 +2319,10 @@
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
__declspec(naked) void I422ToRGBARow_AVX2(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2551,10 +2551,10 @@
// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I444ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2588,10 +2588,10 @@
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgb24,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2626,10 +2626,10 @@
// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) void I422ToRGB565Row_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb565_buf,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* rgb565_buf,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2669,10 +2669,10 @@
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I422ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2706,11 +2706,11 @@
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
__declspec(naked) void I422AlphaToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- const uint8* a_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ const uint8_t* a_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2746,9 +2746,9 @@
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV12ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* uv_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2778,9 +2778,9 @@
// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV21ToARGBRow_SSSE3(
- const uint8* y_buf,
- const uint8* vu_buf,
- uint8* dst_argb,
+ const uint8_t* y_buf,
+ const uint8_t* vu_buf,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2810,8 +2810,8 @@
// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void YUY2ToARGBRow_SSSE3(
- const uint8* src_yuy2,
- uint8* dst_argb,
+ const uint8_t* src_yuy2,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2838,8 +2838,8 @@
// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void UYVYToARGBRow_SSSE3(
- const uint8* src_uyvy,
- uint8* dst_argb,
+ const uint8_t* src_uyvy,
+ uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2864,10 +2864,10 @@
}
__declspec(naked) void I422ToRGBARow_SSSE3(
- const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
+ const uint8_t* y_buf,
+ const uint8_t* u_buf,
+ const uint8_t* v_buf,
+ uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
__asm {
@@ -2900,8 +2900,8 @@
#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
- uint8* rgb_buf,
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2927,7 +2927,7 @@
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
- // Step 2: Weave into ARGB
+ // Step 2: Weave into ARGB
punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
@@ -2947,8 +2947,8 @@
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
- uint8* rgb_buf,
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+ uint8_t* rgb_buf,
int width) {
__asm {
mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
@@ -2975,8 +2975,8 @@
vpsrlw ymm0, ymm0, 6
vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
- // TODO(fbarchard): Weave alpha with unpack.
- // Step 2: Weave into ARGB
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
vpermq ymm1, ymm1, 0xd8
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
@@ -3000,8 +3000,8 @@
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
- uint8* dst,
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3022,7 +3022,9 @@
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
@@ -3048,9 +3050,9 @@
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
-__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3079,8 +3081,8 @@
#endif // HAS_MIRRORUVROW_SSSE3
#ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3105,8 +3107,8 @@
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
-__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
@@ -3127,9 +3129,9 @@
#endif // HAS_ARGBMIRRORROW_AVX2
#ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3167,9 +3169,9 @@
#endif // HAS_SPLITUVROW_SSE2
#ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3207,9 +3209,9 @@
#endif // HAS_SPLITUVROW_AVX2
#ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
__asm {
push edi
@@ -3239,9 +3241,9 @@
#endif // HAS_MERGEUVROW_SSE2
#ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
- const uint8* src_v,
- uint8* dst_uv,
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_uv,
int width) {
__asm {
push edi
@@ -3273,12 +3275,14 @@
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
test eax, 15
jne convertloopu
test edx, 15
@@ -3310,12 +3314,14 @@
#endif // HAS_COPYROW_SSE2
#ifdef HAS_COPYROW_AVX
-// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
convertloop:
vmovdqu ymm0, [eax]
@@ -3334,13 +3340,15 @@
#endif // HAS_COPYROW_AVX
// Multiple of 1.
-__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+ uint8_t* dst,
+ int width) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep movsb
mov edi, edx
mov esi, eax
@@ -3350,13 +3358,13 @@
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
@@ -3387,13 +3395,13 @@
#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
@@ -3417,8 +3425,8 @@
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
- uint8* dst_a,
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_a,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -3445,8 +3453,8 @@
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
- uint8* dst_a,
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_a,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -3481,13 +3489,13 @@
#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
pcmpeqb xmm0, xmm0 // generate mask 0xff000000
pslld xmm0, 24
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
@@ -3520,13 +3528,13 @@
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
-__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
- uint8* dst,
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+ uint8_t* dst,
int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
@@ -3551,16 +3559,16 @@
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated.
-// Count should be multiple of 4.
-__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+// width should be a multiple of 4.
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
__asm {
movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
shr ecx, 2
rep stosd
mov edi, edx
@@ -3568,26 +3576,28 @@
}
}
-// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8 bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v8
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep stosb
mov edi, edx
ret
}
}
-// Write 'count' 32 bit values.
-__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+// Write 'width' 32 bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+ uint32_t v32,
+ int width) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
- mov ecx, [esp + 12] // count
+ mov ecx, [esp + 12] // width
rep stosd
mov edi, edx
ret
@@ -3596,8 +3606,8 @@
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
- uint8* dst_y,
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_yuy2
@@ -3623,10 +3633,10 @@
}
}
-__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3669,9 +3679,9 @@
}
}
-__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3709,8 +3719,8 @@
}
}
-__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
- uint8* dst_y,
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_uyvy
@@ -3734,10 +3744,10 @@
}
}
-__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3780,9 +3790,9 @@
}
}
-__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3822,8 +3832,8 @@
#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
- uint8* dst_y,
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_yuy2
@@ -3847,10 +3857,10 @@
}
}
-__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3892,9 +3902,9 @@
}
}
-__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -3929,8 +3939,8 @@
}
}
-__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
- uint8* dst_y,
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_y,
int width) {
__asm {
mov eax, [esp + 4] // src_uyvy
@@ -3952,10 +3962,10 @@
}
}
-__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
int stride_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push esi
@@ -3997,9 +4007,9 @@
}
}
-__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u,
- uint8* dst_v,
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int width) {
__asm {
push edi
@@ -4041,10 +4051,10 @@
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
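// Check: with C2 = 255 the signed form gives
// ((A2 - 128) * 255 + 32768 + 127) / 256 = (255 * (A2 + 1)) / 256, which
// floors to A2 for any 8-bit A2, so full alpha reproduces the A input exactly.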
-__declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
__asm {
push esi
@@ -4067,7 +4077,7 @@
sub edx, esi
sub edi, esi
- // 8 pixel loop.
+ // 8 pixel loop.
convertloop8:
movq xmm0, qword ptr [esi] // alpha
punpcklbw xmm0, xmm0
@@ -4098,10 +4108,10 @@
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
- const uint8* src1,
- const uint8* alpha,
- uint8* dst,
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+ const uint8_t* src1,
+ const uint8_t* alpha,
+ uint8_t* dst,
int width) {
__asm {
push esi
@@ -4123,7 +4133,7 @@
sub edx, esi
sub edi, esi
- // 32 pixel loop.
+ // 32 pixel loop.
convertloop32:
vmovdqu ymm0, [esi] // alpha
vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
@@ -4162,9 +4172,9 @@
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
-__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4183,7 +4193,7 @@
sub ecx, 4
jl convertloop4b // less than 4 pixels?
- // 4 pixel loop.
+ // 4 pixel loop.
convertloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
@@ -4212,7 +4222,7 @@
add ecx, 4 - 1
jl convertloop1b
- // 1 pixel loop.
+ // 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@@ -4253,8 +4263,8 @@
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
-__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4298,8 +4308,8 @@
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
128u, 128u, 14u, 15u, 14u, 15u,
14u, 15u, 128u, 128u};
-__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4336,8 +4346,8 @@
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
push ebx
@@ -4392,8 +4402,8 @@
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
@@ -4426,8 +4436,8 @@
}
}
#else // USE_GATHER
-__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
@@ -4495,8 +4505,8 @@
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -4552,7 +4562,7 @@
24, 98, 50, 0, 24, 98, 50, 0};
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
__asm {
mov eax, [esp + 4] /* dst_argb */
mov ecx, [esp + 8] /* width */
@@ -4608,9 +4618,9 @@
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const int8* matrix_argb,
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const int8_t* matrix_argb,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
@@ -4670,7 +4680,7 @@
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
int scale,
int interval_size,
int interval_offset,
@@ -4717,10 +4727,10 @@
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- uint32 value) {
+ uint32_t value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
@@ -4752,9 +4762,9 @@
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4792,9 +4802,9 @@
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4841,9 +4851,9 @@
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4871,9 +4881,9 @@
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4909,9 +4919,9 @@
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4939,9 +4949,9 @@
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
- const uint8* src_argb1,
- uint8* dst_argb,
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+ const uint8_t* src_argb1,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -4972,10 +4982,10 @@
// -1 0 1
// -2 0 2
// -1 0 1
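// Scalar form of the kernel above (a sketch; clamp255 saturates to 0..255):
//   a = src_y0[i] - src_y0[i + 2]
//   b = src_y1[i] - src_y1[i + 2]
//   c = src_y2[i] - src_y2[i + 2]
//   dst_sobelx[i] = clamp255(abs(a + 2 * b + c))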
-__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- const uint8* src_y2,
- uint8* dst_sobelx,
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ const uint8_t* src_y2,
+ uint8_t* dst_sobelx,
int width) {
__asm {
push esi
@@ -5030,9 +5040,9 @@
// -1 -2 -1
// 0 0 0
// 1 2 1
-__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
- const uint8* src_y1,
- uint8* dst_sobely,
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+ const uint8_t* src_y1,
+ uint8_t* dst_sobely,
int width) {
__asm {
push esi
@@ -5084,9 +5094,9 @@
// R = Sobel
// G = Sobel
// B = Sobel
-__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -5132,9 +5142,9 @@
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_y,
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_y,
int width) {
__asm {
push esi
@@ -5166,9 +5176,9 @@
// R = Sobel X
// G = Sobel
// B = Sobel Y
-__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
- const uint8* src_sobely,
- uint8* dst_argb,
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+ const uint8_t* src_sobely,
+ uint8_t* dst_argb,
int width) {
__asm {
push esi
@@ -5225,11 +5235,11 @@
// count is number of averaged pixels to produce.
// Does 4 pixels at a time.
// This function requires alignment on accumulation buffer pointers.
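// With cumulative sums the box sum needs only four lookups per pixel,
// matching the loads below:
//   sum = topleft[i] - topleft[i + w] - botleft[i] + botleft[i + w]
//   dst[i] = sum / area   (1 / area is precomputed in 0.16 fixed point)
// where w is the box width in elements.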
-void CumulativeSumToAverageRow_SSE2(const int32* topleft,
- const int32* botleft,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+ const int32_t* botleft,
int width,
int area,
- uint8* dst,
+ uint8_t* dst,
int count) {
__asm {
mov eax, topleft // eax topleft
@@ -5256,7 +5266,7 @@
cvtps2dq xmm5, xmm5 // 0.16 fixed point
packssdw xmm5, xmm5 // 16 bit shorts
- // 4 pixel loop small blocks.
+ // 4 pixel loop small blocks.
s4:
// top left
movdqu xmm0, [eax]
@@ -5298,7 +5308,7 @@
jmp l4b
- // 4 pixel loop
+ // 4 pixel loop
l4:
// top left
movdqu xmm0, [eax]
@@ -5350,7 +5360,7 @@
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5375,9 +5385,9 @@
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
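// Per channel the loop below keeps a running sum of the current row and adds
// the entry above, i.e. cumsum[x] = (row[0] + ... + row[x]) +
// previous_cumsum[x], so each entry sums everything above and to the left.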
-void ComputeCumulativeSumRow_SSE2(const uint8* row,
- int32* cumsum,
- const int32* previous_cumsum,
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+ int32_t* cumsum,
+ const int32_t* previous_cumsum,
int width) {
__asm {
mov eax, row
@@ -5392,7 +5402,7 @@
test edx, 15
jne l4b
- // 4 pixel loop
+ // 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5438,9 +5448,9 @@
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
- movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
lea eax, [eax + 4]
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
@@ -5460,9 +5470,9 @@
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
int src_argb_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
const float* uv_dudv,
int width) {
__asm {
@@ -5481,7 +5491,7 @@
sub ecx, 4
jl l4b
- // setup for 4 pixel loop
+ // setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5493,7 +5503,7 @@
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
- // 4 pixel loop
+ // 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@@ -5524,7 +5534,7 @@
add ecx, 4 - 1
jl l1b
- // 1 pixel loop
+ // 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@@ -5546,8 +5556,8 @@
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
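// Per byte, with f = source_y_fraction in [0, 256), the blend is (a sketch):
//   dst[i] = (src[i] * (256 - f) + src[i + stride] * f + 128) >> 8
// The f == 0 and f == 128 cases take the copy and pavgb fast paths below.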
-__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
@@ -5598,7 +5608,7 @@
jg xloop
jmp xloop99
- // Blend 50 / 50.
+ // Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
@@ -5608,7 +5618,7 @@
jg xloop50
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
+ // Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@@ -5623,8 +5633,8 @@
// Bilinear filter 16x2 -> 16x1
// TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
@@ -5638,7 +5648,7 @@
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
- // Dispatch to specialized filters if applicable.
+ // Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
@@ -5678,7 +5688,7 @@
jg xloop
jmp xloop99
- // Blend 50 / 50.
+ // Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -5689,7 +5699,7 @@
jg xloop50
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
+ // Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
movdqu [esi + edi], xmm0
@@ -5705,9 +5715,9 @@
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -5732,9 +5742,9 @@
}
#ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
+ const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -5761,133 +5771,16 @@
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
- const uint8* shuffler,
- int width) {
- __asm {
- push ebx
- push esi
- mov eax, [esp + 8 + 4] // src_argb
- mov edx, [esp + 8 + 8] // dst_argb
- mov esi, [esp + 8 + 12] // shuffler
- mov ecx, [esp + 8 + 16] // width
- pxor xmm5, xmm5
-
- mov ebx, [esi] // shuffler
- cmp ebx, 0x03000102
- je shuf_3012
- cmp ebx, 0x00010203
- je shuf_0123
- cmp ebx, 0x00030201
- je shuf_0321
- cmp ebx, 0x02010003
- je shuf_2103
-
- // TODO(fbarchard): Use one source pointer and 3 offsets.
- shuf_any1:
- movzx ebx, byte ptr [esi]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx], bl
- movzx ebx, byte ptr [esi + 1]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 1], bl
- movzx ebx, byte ptr [esi + 2]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 2], bl
- movzx ebx, byte ptr [esi + 3]
- movzx ebx, byte ptr [eax + ebx]
- mov [edx + 3], bl
- lea eax, [eax + 4]
- lea edx, [edx + 4]
- sub ecx, 1
- jg shuf_any1
- jmp shuf99
-
- shuf_0123:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
- pshuflw xmm0, xmm0, 01Bh
- pshufhw xmm1, xmm1, 01Bh
- pshuflw xmm1, xmm1, 01Bh
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0123
- jmp shuf99
-
- shuf_0321:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
- pshuflw xmm0, xmm0, 039h
- pshufhw xmm1, xmm1, 039h
- pshuflw xmm1, xmm1, 039h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_0321
- jmp shuf99
-
- shuf_2103:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
- pshuflw xmm0, xmm0, 093h
- pshufhw xmm1, xmm1, 093h
- pshuflw xmm1, xmm1, 093h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_2103
- jmp shuf99
-
- shuf_3012:
- movdqu xmm0, [eax]
- lea eax, [eax + 16]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm5
- punpckhbw xmm1, xmm5
- pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
- pshuflw xmm0, xmm0, 0C6h
- pshufhw xmm1, xmm1, 0C6h
- pshuflw xmm1, xmm1, 0C6h
- packuswb xmm0, xmm1
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- sub ecx, 4
- jg shuf_3012
-
- shuf99:
- pop esi
- pop ebx
- ret
- }
-}
-
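The SSE2 kernel removed above kept a generic shuf_any1 path for arbitrary shufflers; that loop is equivalent to this scalar sketch (hypothetical helper name), one table lookup per output byte:

    #include <stdint.h>

    // shuffler holds four byte indices; each output byte of a pixel is
    // the source byte those indices select.
    static void ARGBShuffleSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
      for (int i = 0; i < width; ++i) {
        for (int j = 0; j < 4; ++j) {
          dst_argb[j] = src_argb[shuffler[j]];
        }
        src_argb += 4;
        dst_argb += 4;
      }
    }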
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U1Y3V1...Y4U2Y5V2....
// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1
-__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
__asm {
push esi
@@ -5921,10 +5814,10 @@
}
}
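Per the macro-pixel layout noted above, two Y samples share one U and one V. A scalar sketch of the packing (hypothetical helper; even widths assumed, odd-width tail omitted):

    #include <stdint.h>

    // I422 -> YUY2: emit Y0 U0 Y1 V0 per macro-pixel.
    static void I422ToYUY2Sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst_frame,
                                 int width) {
      for (int i = 0; i < width; i += 2) {
        dst_frame[0] = src_y[0];
        dst_frame[1] = src_u[0];
        dst_frame[2] = src_y[1];
        dst_frame[3] = src_v[0];
        dst_frame += 4;
        src_y += 2;
        src_u += 1;
        src_v += 1;
      }
    }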
-__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_frame,
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_frame,
int width) {
__asm {
push esi
@@ -5959,8 +5852,8 @@
}
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
__asm {
@@ -5971,7 +5864,7 @@
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
- // 2 pixel loop.
+ // 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@@ -6018,8 +5911,8 @@
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+ uint8_t* dst_argb,
const float* poly,
int width) {
__asm {
@@ -6058,8 +5951,8 @@
#ifdef HAS_HALFFLOATROW_SSE2
static float kExpBias = 1.9259299444e-34f;
-__declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
@@ -6072,7 +5965,7 @@
pxor xmm5, xmm5
sub edx, eax
- // 8 pixel loop.
+ // 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6095,8 +5988,8 @@
#endif // HAS_HALFFLOATROW_SSE2
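kExpBias above is 2^-112: multiplying by it rebiases a float32 exponent (bias 127) down to float16's bias of 15, so after the multiply the half value is simply the float32 bit pattern shifted right by 13. A scalar sketch (hypothetical helper; inputs are unsigned so the sign bit is zero, and overflow/NaN handling is omitted):

    #include <stdint.h>
    #include <string.h>

    static uint16_t HalfFloatSketch(uint16_t v, float scale) {
      const float kExpBias = 1.9259299444e-34f;  // 2^-112
      float f = (float)v * scale * kExpBias;     // rebias the exponent
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));           // reinterpret float bits
      return (uint16_t)(bits >> 13);             // truncate low mantissa bits
    }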
#ifdef HAS_HALFFLOATROW_AVX2
-__declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
@@ -6110,7 +6003,7 @@
vpxor ymm5, ymm5, ymm5
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6133,8 +6026,8 @@
#endif // HAS_HALFFLOATROW_AVX2
#ifdef HAS_HALFFLOATROW_F16C
-__declspec(naked) void HalfFloatRow_F16C(const uint16* src,
- uint16* dst,
+__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+ uint16_t* dst,
float scale,
int width) {
__asm {
@@ -6144,7 +6037,7 @@
mov ecx, [esp + 16] /* width */
sub edx, eax
- // 16 pixel loop.
+ // 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
@@ -6167,8 +6060,8 @@
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
-__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
- const uint8* table_argb,
+__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
int width) {
__asm {
push esi
@@ -6201,8 +6094,8 @@
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
-__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
- const uint8* table_argb,
+__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+ const uint8_t* table_argb,
int width) {
__asm {
push esi
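Both color-table kernels rewrite each channel through a lookup table. A scalar sketch, assuming the interleaved layout of libyuv's C reference (four 256-entry per-channel tables interleaved in groups of four bytes):

    #include <stdint.h>

    // Each channel indexes its own interleaved table:
    // new_value = table_argb[old_value * 4 + channel].
    static void ARGBColorTableSketch(uint8_t* dst_argb,
                                     const uint8_t* table_argb, int width) {
      for (int i = 0; i < width; ++i) {
        dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
        dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
        dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
        dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
        dst_argb += 4;
      }
    }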
@@ -6233,11 +6126,11 @@
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
-__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
- uint8* dst_argb,
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+ uint8_t* dst_argb,
int width,
- const uint8* luma,
- uint32 lumacoeff) {
+ const uint8_t* luma,
+ uint32_t lumacoeff) {
__asm {
push esi
push edi
@@ -6252,7 +6145,7 @@
psllw xmm4, 8
pxor xmm5, xmm5
- // 4 pixel loop.
+ // 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3
diff --git a/files/source/scale.cc b/files/source/scale.cc
index 010ad9d..ab08549 100644
--- a/files/source/scale.cc
+++ b/files/source/scale.cc
@@ -39,12 +39,12 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
+ void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width) =
filtering == kFilterNone
? ScaleRowDown2_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_C
@@ -103,13 +103,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN2_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
- }
-#endif
#if defined(HAS_SCALEROWDOWN2_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown2 =
@@ -125,6 +118,21 @@
}
}
#endif
+#if defined(HAS_SCALEROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown2 =
+ filtering == kFilterNone
+ ? ScaleRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI
+ : ScaleRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_MMI
+ : ScaleRowDown2Box_MMI);
+ }
+ }
+#endif
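Read top to bottom, these dispatch blocks form a priority ladder: the last CPU feature that tests true wins, the _Any_ variant covers arbitrary widths, and the unsuffixed variant replaces it only when the width meets the alignment. With MMI present, for example, a 100-pixel dst row keeps ScaleRowDown2_Any_MMI (100 is not a multiple of 8) while a 96-pixel row upgrades to ScaleRowDown2_MMI.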
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -143,12 +151,12 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) =
+ void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width) =
filtering == kFilterNone
? ScaleRowDown2_16_C
: (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
@@ -176,12 +184,12 @@
: ScaleRowDown2Box_16_SSE2);
}
#endif
-#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown2 =
- filtering ? ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
+#if defined(HAS_SCALEROWDOWN2_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI
+ : (filtering == kFilterLinear
+ ? ScaleRowDown2Linear_16_MMI
+ : ScaleRowDown2Box_16_MMI);
}
#endif
@@ -206,12 +214,12 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) =
+ void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
int row_stride = src_stride << 2;
(void)src_width;
@@ -247,13 +255,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN4_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
- }
-#endif
#if defined(HAS_SCALEROWDOWN4_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ScaleRowDown4 =
@@ -263,6 +264,15 @@
}
}
#endif
+#if defined(HAS_SCALEROWDOWN4_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleRowDown4 =
+ filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI;
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -280,12 +290,12 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width) =
+ void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width) =
filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
int row_stride = src_stride << 2;
(void)src_width;
@@ -306,12 +316,9 @@
filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
}
#endif
-#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- ScaleRowDown4 =
- filtering ? ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
+#if defined(HAS_SCALEROWDOWN4_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI;
}
#endif
@@ -332,14 +339,14 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
(void)src_width;
(void)src_height;
@@ -371,6 +378,26 @@
}
}
#endif
+#if defined(HAS_SCALEROWDOWN34_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
+ ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
+ }
+ if (dst_width % 48 == 0) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_MSA;
+ ScaleRowDown34_1 = ScaleRowDown34_MSA;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
+ }
+ }
+ }
+#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (!filtering) {
@@ -391,19 +418,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
- }
- }
-#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -434,14 +448,14 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
(void)src_width;
(void)src_height;
@@ -475,19 +489,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
- } else {
- ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
- ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
- }
- }
-#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -533,14 +534,14 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+ uint8_t* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
assert(dst_width % 3 == 0);
(void)src_width;
@@ -592,19 +593,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN38_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
- }
- }
-#endif
#if defined(HAS_SCALEROWDOWN38_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
if (!filtering) {
@@ -655,14 +643,14 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
int y;
- void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
- void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+ uint16_t* dst_ptr, int dst_width);
const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
(void)src_width;
(void)src_height;
@@ -696,19 +684,6 @@
}
}
#endif
-#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
- IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
- IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
- if (!filtering) {
- ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
- } else {
- ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
- ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
- }
- }
-#endif
for (y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -735,8 +710,8 @@
#define MIN1(x) ((x) < 1 ? 1 : (x))
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
- uint32 sum = 0u;
+static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
+ uint32_t sum = 0u;
int x;
assert(iboxwidth > 0);
for (x = 0; x < iboxwidth; ++x) {
@@ -745,8 +720,8 @@
return sum;
}
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
- uint32 sum = 0u;
+static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
+ uint32_t sum = 0u;
int x;
assert(iboxwidth > 0);
for (x = 0; x < iboxwidth; ++x) {
@@ -759,8 +734,8 @@
int boxheight,
int x,
int dx,
- const uint16* src_ptr,
- uint8* dst_ptr) {
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
@@ -781,8 +756,8 @@
int boxheight,
int x,
int dx,
- const uint32* src_ptr,
- uint16* dst_ptr) {
+ const uint32_t* src_ptr,
+ uint16_t* dst_ptr) {
int i;
int scaletbl[2];
int minboxwidth = dx >> 16;
@@ -802,11 +777,12 @@
static void ScaleAddCols0_C(int dst_width,
int boxheight,
int x,
- int,
- const uint16* src_ptr,
- uint8* dst_ptr) {
+ int dx,
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr) {
int scaleval = 65536 / boxheight;
int i;
+ (void)dx;
src_ptr += (x >> 16);
for (i = 0; i < dst_width; ++i) {
*dst_ptr++ = src_ptr[i] * scaleval >> 16;
@@ -817,8 +793,8 @@
int boxheight,
int x,
int dx,
- const uint16* src_ptr,
- uint8* dst_ptr) {
+ const uint16_t* src_ptr,
+ uint8_t* dst_ptr) {
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
@@ -833,8 +809,8 @@
int boxheight,
int x,
int dx,
- const uint32* src_ptr,
- uint16* dst_ptr) {
+ const uint32_t* src_ptr,
+ uint16_t* dst_ptr) {
int boxwidth = MIN1(dx >> 16);
int scaleval = 65536 / (boxwidth * boxheight);
int i;
@@ -857,8 +833,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr) {
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -870,14 +846,14 @@
&dx, &dy);
src_width = Abs(src_width);
{
- // Allocate a row buffer of uint16.
+ // Allocate a row buffer of uint16_t.
align_buffer_64(row16, src_width * 2);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint16* src_ptr, uint8* dst_ptr) =
+ const uint16_t* src_ptr, uint8_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_C
: ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
- void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
- ScaleAddRow_C;
+ void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
+ int src_width) = ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleAddRow = ScaleAddRow_Any_SSE2;
@@ -910,11 +886,11 @@
}
}
#endif
-#if defined(HAS_SCALEADDROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- ScaleAddRow = ScaleAddRow_Any_DSPR2;
- if (IS_ALIGNED(src_width, 16)) {
- ScaleAddRow = ScaleAddRow_DSPR2;
+#if defined(HAS_SCALEADDROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleAddRow = ScaleAddRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 8)) {
+ ScaleAddRow = ScaleAddRow_MMI;
}
}
#endif
@@ -922,7 +898,7 @@
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint8* src = src_ptr + iy * src_stride;
+ const uint8_t* src = src_ptr + iy * src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -930,10 +906,10 @@
boxheight = MIN1((y >> 16) - iy);
memset(row16, 0, src_width * 2);
for (k = 0; k < boxheight; ++k) {
- ScaleAddRow(src, (uint16*)(row16), src_width);
+ ScaleAddRow(src, (uint16_t*)(row16), src_width);
src += src_stride;
}
- ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
dst_ptr += dst_stride;
}
free_aligned_buffer_64(row16);
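The box path above is two passes in 16.16 fixed point: ScaleAddRow sums boxheight source rows into the wider row16 accumulator, then ScaleAddCols divides by the box area via the precomputed 65536/area reciprocal. A stripped-down sketch of one output pixel (hypothetical helper; boxwidth and boxheight at least 1, as MIN1 guarantees):

    #include <stdint.h>

    // Sum a boxwidth x boxheight block of 8-bit samples, then scale by
    // the 16.16 reciprocal of the box area, as ScaleAddCols1_C does.
    static uint8_t BoxPixelSketch(const uint8_t* src, int src_stride,
                                  int boxwidth, int boxheight) {
      uint32_t sum = 0;
      for (int y = 0; y < boxheight; ++y) {
        for (int x = 0; x < boxwidth; ++x) {
          sum += src[y * src_stride + x];
        }
      }
      int scaleval = 65536 / (boxwidth * boxheight);
      return (uint8_t)((sum * scaleval) >> 16);
    }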
@@ -946,8 +922,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr) {
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
int j, k;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -959,13 +935,13 @@
&dx, &dy);
src_width = Abs(src_width);
{
- // Allocate a row buffer of uint32.
+ // Allocate a row buffer of uint32_t.
align_buffer_64(row32, src_width * 4);
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
- const uint32* src_ptr, uint16* dst_ptr) =
+ const uint32_t* src_ptr, uint16_t* dst_ptr) =
(dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
- void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
- ScaleAddRow_16_C;
+ void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
+ int src_width) = ScaleAddRow_16_C;
#if defined(HAS_SCALEADDROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
@@ -973,10 +949,15 @@
}
#endif
+#if defined(HAS_SCALEADDROW_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) {
+ ScaleAddRow = ScaleAddRow_16_MMI;
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int boxheight;
int iy = y >> 16;
- const uint16* src = src_ptr + iy * src_stride;
+ const uint16_t* src = src_ptr + iy * src_stride;
y += dy;
if (y > max_y) {
y = max_y;
@@ -984,10 +965,10 @@
boxheight = MIN1((y >> 16) - iy);
memset(row32, 0, src_width * 4);
for (k = 0; k < boxheight; ++k) {
- ScaleAddRow(src, (uint32*)(row32), src_width);
+ ScaleAddRow(src, (uint32_t*)(row32), src_width);
src += src_stride;
}
- ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+ ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
dst_ptr += dst_stride;
}
free_aligned_buffer_64(row32);
@@ -1001,8 +982,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1015,10 +996,10 @@
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
- int x, int dx) =
+ void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
- void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1049,14 +1030,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
- if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1065,6 +1038,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1079,13 +1060,21 @@
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_MSA;
+ }
+ }
+#endif
if (y > max_y) {
y = max_y;
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
@@ -1108,8 +1097,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1122,10 +1111,10 @@
const int max_y = (src_height - 1) << 16;
int j;
- void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
- void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1164,14 +1153,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
- if (IS_ALIGNED(src_width, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
- }
- }
-#endif
#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1184,13 +1165,13 @@
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint16* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
} else {
int yf = (y >> 8) & 255;
- InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
- ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+ InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
}
dst_ptr += dst_stride;
y += dy;
@@ -1208,8 +1189,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr,
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr,
enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1218,11 +1199,11 @@
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint8 * dst_ptr, const uint8* src_ptr,
+ void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleFilterCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width,
- int x, int dx) =
+ void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+ int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_C : ScaleCols_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
&dx, &dy);
@@ -1252,14 +1233,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
- }
-#endif
if (filtering && src_width >= 32768) {
ScaleFilterCols = ScaleFilterCols64_C;
@@ -1277,6 +1250,14 @@
}
}
#endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+ ScaleFilterCols = ScaleFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 16)) {
+ ScaleFilterCols = ScaleFilterCols_MSA;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
@@ -1284,6 +1265,11 @@
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1291,13 +1277,13 @@
}
{
int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
+ const uint8_t* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
- uint8* rowptr = row;
+ uint8_t* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
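The (dst_width + 31) & ~31 above rounds the row size up to a multiple of 32 so the two ping-pong rows stay SIMD-friendly (align_buffer_64 already aligns the base):

    // Align-up idiom: add 31, then clear the low 5 bits.
    // e.g. (100 + 31) & ~31 == 131 & ~31 == 128.
    static int AlignUp32(int w) { return (w + 31) & ~31; }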
@@ -1343,8 +1329,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr,
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr,
enum FilterMode filtering) {
int j;
// Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1353,10 +1339,10 @@
int dx = 0;
int dy = 0;
const int max_y = (src_height - 1) << 16;
- void (*InterpolateRow)(uint16 * dst_ptr, const uint16* src_ptr,
+ void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
- void (*ScaleFilterCols)(uint16 * dst_ptr, const uint16* src_ptr,
+ void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
int dst_width, int x, int dx) =
filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
@@ -1395,14 +1381,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
- if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
- }
- }
-#endif
if (filtering && src_width >= 32768) {
ScaleFilterCols = ScaleFilterCols64_16_C;
@@ -1419,6 +1397,11 @@
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleFilterCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -1426,13 +1409,13 @@
}
{
int yi = y >> 16;
- const uint16* src = src_ptr + yi * src_stride;
+ const uint16_t* src = src_ptr + yi * src_stride;
// Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
align_buffer_64(row, kRowSize * 4);
- uint16* rowptr = (uint16*)row;
+ uint16_t* rowptr = (uint16_t*)row;
int rowstride = kRowSize;
int lasty = yi;
@@ -1483,11 +1466,11 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_ptr,
- uint8* dst_ptr) {
+ const uint8_t* src_ptr,
+ uint8_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint8 * dst_ptr, const uint8* src_ptr, int dst_width, int x,
- int dx) = ScaleCols_C;
+ void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+ int x, int dx) = ScaleCols_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
@@ -1504,6 +1487,11 @@
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1519,10 +1507,10 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_ptr,
- uint16* dst_ptr) {
+ const uint16_t* src_ptr,
+ uint16_t* dst_ptr) {
int i;
- void (*ScaleCols)(uint16 * dst_ptr, const uint16* src_ptr, int dst_width,
+ void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
int x, int dx) = ScaleCols_16_C;
// Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
@@ -1540,6 +1528,11 @@
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
+#if defined(HAS_SCALECOLS_16_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
+ ScaleCols = ScaleColsUp2_16_MMI;
+ }
+#endif
}
for (i = 0; i < dst_height; ++i) {
@@ -1553,11 +1546,11 @@
// This function dispatches to a specialized scaler based on scale factor.
LIBYUV_API
-void ScalePlane(const uint8* src,
+void ScalePlane(const uint8_t* src,
int src_stride,
int src_width,
int src_height,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int dst_width,
int dst_height,
@@ -1636,11 +1629,11 @@
}
LIBYUV_API
-void ScalePlane_16(const uint16* src,
+void ScalePlane_16(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
- uint16* dst,
+ uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
@@ -1663,7 +1656,7 @@
CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
return;
}
- if (dst_width == src_width) {
+ if (dst_width == src_width && filtering != kFilterBox) {
int dy = FixedDiv(src_height, dst_height);
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
@@ -1692,7 +1685,7 @@
return;
}
if (4 * dst_width == src_width && 4 * dst_height == src_height &&
- filtering != kFilterBilinear) {
+ (filtering == kFilterBox || filtering == kFilterNone)) {
// optimized, 1/4
ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1722,19 +1715,19 @@
// This function in turn calls a scaling function for each plane.
LIBYUV_API
-int I420Scale(const uint8* src_y,
+int I420Scale(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
int src_width,
int src_height,
- uint8* dst_y,
+ uint8_t* dst_y,
int dst_stride_y,
- uint8* dst_u,
+ uint8_t* dst_u,
int dst_stride_u,
- uint8* dst_v,
+ uint8_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
@@ -1759,19 +1752,19 @@
}
LIBYUV_API
-int I420Scale_16(const uint16* src_y,
+int I420Scale_16(const uint16_t* src_y,
int src_stride_y,
- const uint16* src_u,
+ const uint16_t* src_u,
int src_stride_u,
- const uint16* src_v,
+ const uint16_t* src_v,
int src_stride_v,
int src_width,
int src_height,
- uint16* dst_y,
+ uint16_t* dst_y,
int dst_stride_y,
- uint16* dst_u,
+ uint16_t* dst_u,
int dst_stride_u,
- uint16* dst_v,
+ uint16_t* dst_v,
int dst_stride_v,
int dst_width,
int dst_height,
@@ -1795,19 +1788,88 @@
return 0;
}
+// Scale an I444 image.
+// This function in turn calls a scaling function for each plane.
+
+LIBYUV_API
+int I444Scale(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_u,
+ int src_stride_u,
+ const uint8_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
+
+LIBYUV_API
+int I444Scale_16(const uint16_t* src_y,
+ int src_stride_y,
+ const uint16_t* src_u,
+ int src_stride_u,
+ const uint16_t* src_v,
+ int src_stride_v,
+ int src_width,
+ int src_height,
+ uint16_t* dst_y,
+ int dst_stride_y,
+ uint16_t* dst_u,
+ int dst_stride_u,
+ uint16_t* dst_v,
+ int dst_stride_v,
+ int dst_width,
+ int dst_height,
+ enum FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+ dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+
+ ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u,
+ dst_width, dst_height, filtering);
+ ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v,
+ dst_width, dst_height, filtering);
+ return 0;
+}
+
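A minimal usage sketch for the new I444 entry point (hypothetical call site; contiguous planes with stride equal to width assumed):

    #include "libyuv/scale.h"

    // Halve an 8-bit I444 frame. In I444 all three planes are full
    // resolution, so every plane shares the same geometry.
    int HalveI444(const uint8_t* sy, const uint8_t* su, const uint8_t* sv,
                  int w, int h, uint8_t* dy, uint8_t* du, uint8_t* dv) {
      return I444Scale(sy, w, su, w, sv, w, w, h,
                       dy, w / 2, du, w / 2, dv, w / 2, w / 2, h / 2,
                       kFilterBilinear);
    }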
// Deprecated api
LIBYUV_API
-int Scale(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
+int Scale(const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
int src_stride_y,
int src_stride_u,
int src_stride_v,
int src_width,
int src_height,
- uint8* dst_y,
- uint8* dst_u,
- uint8* dst_v,
+ uint8_t* dst_y,
+ uint8_t* dst_u,
+ uint8_t* dst_v,
int dst_stride_y,
int dst_stride_u,
int dst_stride_v,
@@ -1820,43 +1882,6 @@
dst_height, interpolate ? kFilterBox : kFilterNone);
}
-// Deprecated api
-LIBYUV_API
-int ScaleOffset(const uint8* src,
- int src_width,
- int src_height,
- uint8* dst,
- int dst_width,
- int dst_height,
- int dst_yoffset,
- LIBYUV_BOOL interpolate) {
- // Chroma requires offset to multiple of 2.
- int dst_yoffset_even = dst_yoffset & ~1;
- int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
- int src_halfheight = SUBSAMPLE(src_height, 1, 1);
- int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
- int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
- int aheight = dst_height - dst_yoffset_even * 2; // actual output height
- const uint8* src_y = src;
- const uint8* src_u = src + src_width * src_height;
- const uint8* src_v =
- src + src_width * src_height + src_halfwidth * src_halfheight;
- uint8* dst_y = dst + dst_yoffset_even * dst_width;
- uint8* dst_u =
- dst + dst_width * dst_height + (dst_yoffset_even >> 1) * dst_halfwidth;
- uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
- (dst_yoffset_even >> 1) * dst_halfwidth;
- if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 ||
- dst_height <= 0 || dst_yoffset_even < 0 ||
- dst_yoffset_even >= dst_height) {
- return -1;
- }
- return I420Scale(src_y, src_width, src_u, src_halfwidth, src_v, src_halfwidth,
- src_width, src_height, dst_y, dst_width, dst_u,
- dst_halfwidth, dst_v, dst_halfwidth, dst_width, aheight,
- interpolate ? kFilterBox : kFilterNone);
-}
-
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_any.cc b/files/source/scale_any.cc
index d64ba7a..1783137 100644
--- a/files/source/scale_any.cc
+++ b/files/source/scale_any.cc
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <string.h> // For memset/memcpy
+
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"
@@ -19,22 +21,32 @@
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
- void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, \
- int dx) { \
- int n = dst_width & ~MASK; \
- if (n > 0) { \
- TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
- } \
- TERP_C(dst_ptr + n * BPP, src_ptr, dst_width & MASK, x + n * dx, dx); \
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
+ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+ int dx) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
+ } \
+ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
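Hand-expanding CANY for one instantiation shows the handoff: the SIMD body takes the aligned prefix and the C tail resumes at x + n * dx, so the 16.16 position stays continuous. For ScaleFilterCols_Any_MSA below (MASK 15, BPP 1) the macro generates the equivalent of:

    void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                 int dst_width, int x, int dx) {
      int r = dst_width & 15;   // tail pixels for the C fallback
      int n = dst_width & ~15;  // multiple-of-16 prefix for SIMD
      if (n > 0) {
        ScaleFilterCols_MSA(dst_ptr, src_ptr, n, x, dx);
      }
      ScaleFilterCols_C(dst_ptr + n, src_ptr, r, x + n * dx, dx);
    }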
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
@@ -42,34 +54,42 @@
4,
3)
#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+ ScaleARGBFilterCols_MSA,
+ ScaleARGBFilterCols_C,
+ 4,
+ 7)
+#endif
#undef CANY
// Fixed scale down.
-#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
+// Mask may be non-power of 2, so use MOD
+#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \
+ int n = dst_width - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r); \
}
// Fixed scale down for odd source width. Used by I420Blend subsampling.
// Since dst_width is (width + 1) / 2, this function scales one less pixel
// and copies the last pixel.
-#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, \
- int dst_width) { \
- int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
- dst_ptr + n * BPP, r); \
+#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+ int dst_width) { \
+ int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \
+ int n = (dst_width - 1) - r; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \
+ dst_ptr + n * BPP, r + 1); \
}
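Worked instance of the SDODD arithmetic just above: an odd source width 31 gives dst_width (31 + 1) / 2 = 16; with MASK 7, r = (16 - 1) % 8 = 7 and n = 15 - 7 = 8, so the SIMD body produces 8 outputs and the C fallback produces r + 1 = 8 more, the last of which is the copied final pixel.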
#ifdef HAS_SCALEROWDOWN2_SSSE3
@@ -150,6 +170,27 @@
1,
31)
#endif
+#ifdef HAS_SCALEROWDOWN2_MMI
+SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
+SDANY(ScaleRowDown2Linear_Any_MMI,
+ ScaleRowDown2Linear_MMI,
+ ScaleRowDown2Linear_C,
+ 2,
+ 1,
+ 7)
+SDANY(ScaleRowDown2Box_Any_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_C,
+ 2,
+ 1,
+ 7)
+SDODD(ScaleRowDown2Box_Odd_MMI,
+ ScaleRowDown2Box_MMI,
+ ScaleRowDown2Box_Odd_C,
+ 2,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
@@ -186,6 +227,15 @@
1,
15)
#endif
+#ifdef HAS_SCALEROWDOWN4_MMI
+SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_MMI,
+ ScaleRowDown4Box_MMI,
+ ScaleRowDown4Box_C,
+ 4,
+ 1,
+ 7)
+#endif
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
ScaleRowDown34_SSSE3,
@@ -226,6 +276,26 @@
1,
23)
#endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+ ScaleRowDown34_MSA,
+ ScaleRowDown34_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+ ScaleRowDown34_0_Box_MSA,
+ ScaleRowDown34_0_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+ ScaleRowDown34_1_Box_MSA,
+ ScaleRowDown34_1_Box_C,
+ 4 / 3,
+ 1,
+ 47)
+#endif
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
ScaleRowDown38_SSSE3,
@@ -347,19 +417,39 @@
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWN2_MMI
+SDANY(ScaleARGBRowDown2_Any_MMI,
+ ScaleARGBRowDown2_MMI,
+ ScaleARGBRowDown2_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Linear_Any_MMI,
+ ScaleARGBRowDown2Linear_MMI,
+ ScaleARGBRowDown2Linear_C,
+ 2,
+ 4,
+ 1)
+SDANY(ScaleARGBRowDown2Box_Any_MMI,
+ ScaleARGBRowDown2Box_MMI,
+ ScaleARGBRowDown2Box_C,
+ 2,
+ 4,
+ 1)
+#endif
#undef SDANY
// Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
- void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \
- uint8* dst_ptr, int dst_width) { \
- int r = (int)((unsigned int)dst_width % (MASK + 1)); \
- int n = dst_width - r; \
- if (n > 0) { \
- SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
- } \
- SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
- dst_ptr + n * BPP, r); \
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+ uint8_t* dst_ptr, int dst_width) { \
+ int r = dst_width & MASK; \
+ int n = dst_width & ~MASK; \
+ if (n > 0) { \
+ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \
+ } \
+ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \
+ dst_ptr + n * BPP, r); \
}
#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
@@ -398,15 +488,66 @@
4,
3)
#endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MMI
+SDAANY(ScaleARGBRowDownEven_Any_MMI,
+ ScaleARGBRowDownEven_MMI,
+ ScaleARGBRowDownEven_C,
+ 4,
+ 1)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
+ ScaleARGBRowDownEvenBox_MMI,
+ ScaleARGBRowDownEvenBox_C,
+ 4,
+ 1)
+#endif
+
+#ifdef SASIMDONLY
+// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
+
+// Add rows box filter scale down. Using macro from row_any
+#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint16_t dst_temp[32]); \
+ SIMD_ALIGNED(uint8_t src_temp[32]); \
+ memset(dst_temp, 0, 32 * 2); /* for msan */ \
+ int r = width & MASK; \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ANY_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \
+ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \
+ ANY_SIMD(src_temp, dst_temp, MASK + 1); \
+ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \
+ }
+
+#ifdef HAS_SCALEADDROW_SSE2
+SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_AVX2
+SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
+#endif
+#ifdef HAS_SCALEADDROW_NEON
+SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MSA
+SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
+#endif
+#ifdef HAS_SCALEADDROW_MMI
+SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
+#endif
+#undef SAROW
+
+#else
// Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
- void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
- int n = src_width & ~MASK; \
- if (n > 0) { \
- SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
- } \
- SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
+ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+ int n = src_width & ~MASK; \
+ if (n > 0) { \
+ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
+ } \
+ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
}
#ifdef HAS_SCALEADDROW_SSE2
@@ -421,11 +562,13 @@
#ifdef HAS_SCALEADDROW_MSA
SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
#endif
-#ifdef HAS_SCALEADDROW_DSPR2
-SAANY(ScaleAddRow_Any_DSPR2, ScaleAddRow_DSPR2, ScaleAddRow_C, 15)
+#ifdef HAS_SCALEADDROW_MMI
+SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif
#undef SAANY
+#endif // SASIMDONLY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_argb.cc b/files/source/scale_argb.cc
index 1ea28f0..beef380 100644
--- a/files/source/scale_argb.cc
+++ b/files/source/scale_argb.cc
@@ -36,8 +36,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
@@ -45,8 +45,8 @@
enum FilterMode filtering) {
int j;
int row_stride = src_stride * (dy >> 16);
- void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) =
+ void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ uint8_t* dst_argb, int dst_width) =
filtering == kFilterNone
? ScaleARGBRowDown2_C
: (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
@@ -111,6 +111,22 @@
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWN2_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_Any_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI
+ : ScaleARGBRowDown2Box_Any_MMI);
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDown2 =
+ filtering == kFilterNone
+ ? ScaleARGBRowDown2_MMI
+ : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI
+ : ScaleARGBRowDown2Box_MMI);
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -131,8 +147,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
@@ -142,8 +158,8 @@
const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
int row_stride = src_stride * (dy >> 16);
- void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) =
+ void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ uint8_t* dst_argb, int dst_width) =
ScaleARGBRowDown2Box_C;
// Advance to odd row, even column.
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
@@ -189,8 +205,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
@@ -199,8 +215,8 @@
int j;
int col_step = dx >> 16;
int row_stride = (dy >> 16) * src_stride;
- void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
- int src_step, uint8* dst_argb, int dst_width) =
+ void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8_t* dst_argb, int dst_width) =
filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
(void)src_width;
(void)src_height;
@@ -237,6 +253,16 @@
}
}
#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI
+ : ScaleARGBRowDownEven_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ ScaleARGBRowDownEven =
+ filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI;
+ }
+ }
+#endif
if (filtering == kFilterLinear) {
src_stride = 0;
@@ -255,23 +281,23 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
- int64 xlast = x + (int64)(dst_width - 1) * dx;
- int64 xl = (dx >= 0) ? x : xlast;
- int64 xr = (dx >= 0) ? xlast : x;
+ int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+ int64_t xl = (dx >= 0) ? x : xlast;
+ int64_t xr = (dx >= 0) ? xlast : x;
int clip_src_width;
xl = (xl >> 16) & ~3; // Left edge aligned.
xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
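Worked example of this clip: with x = 0x18000 (1.5 in 16.16), dx = 0x30000 (3.0) and dst_width = 10, xlast = 1.5 + 9 * 3.0 = 28.5, so xl = (1 & ~3) = 0 and xr = 28 + 1 = 29; only source columns 0 through 29 participate in the scale, and everything to their right is clipped away.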
@@ -306,15 +332,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
- IS_ALIGNED(src_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
- if (IS_ALIGNED(clip_src_width, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -336,6 +353,14 @@
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+ }
+ }
+#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
@@ -347,7 +372,7 @@
}
for (j = 0; j < dst_height; ++j) {
int yi = y >> 16;
- const uint8* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * src_stride;
if (filtering == kFilterLinear) {
ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
} else {
@@ -372,18 +397,18 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
- void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
- void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
@@ -411,12 +436,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
- IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -425,6 +444,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width, 2)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
if (src_width >= 32768) {
ScaleARGBFilterCols =
filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
@@ -442,6 +469,14 @@
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -455,6 +490,22 @@
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -462,6 +513,11 @@
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
if (y > max_y) {
@@ -470,13 +526,13 @@
{
int yi = y >> 16;
- const uint8* src = src_argb + yi * src_stride;
+ const uint8_t* src = src_argb + yi * src_stride;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
- uint8* rowptr = row;
+ uint8_t* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
@@ -526,18 +582,18 @@
int src_stride_u,
int src_stride_v,
int dst_stride_argb,
- const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
+ const uint8_t* src_y,
+ const uint8_t* src_u,
+ const uint8_t* src_v,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
int dy,
enum FilterMode filtering) {
int j;
- void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
- const uint8* v_buf, uint8* rgb_buf, int width) =
+ void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+ const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -563,15 +619,6 @@
}
}
#endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
- IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- I422ToARGBRow = I422ToARGBRow_DSPR2;
- }
-#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
@@ -581,7 +628,7 @@
}
#endif
- void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -608,12 +655,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(dst_argb, 4) &&
- IS_ALIGNED(dst_stride_argb, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -623,7 +664,7 @@
}
#endif
- void (*ScaleARGBFilterCols)(uint8 * dst_argb, const uint8* src_argb,
+ void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
if (src_width >= 32768) {
@@ -643,6 +684,14 @@
}
}
#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+ if (filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 8)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+ }
+ }
+#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -656,6 +705,22 @@
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (!filtering && TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBFilterCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBFilterCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -663,6 +728,11 @@
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
const int max_y = (src_height - 1) << 16;
@@ -672,9 +742,9 @@
const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
int yi = y >> 16;
int uv_yi = yi >> kYShift;
- const uint8* src_row_y = src_y + yi * src_stride_y;
- const uint8* src_row_u = src_u + uv_yi * src_stride_u;
- const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+ const uint8_t* src_row_y = src_y + yi * src_stride_y;
+ const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
// Allocate 2 rows of ARGB.
const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -683,7 +753,7 @@
// Allocate 1 row of ARGB for source conversion.
align_buffer_64(argb_row, src_width * 4);
- uint8* rowptr = row;
+ uint8_t* rowptr = row;
int rowstride = kRowSize;
int lasty = yi;
@@ -755,15 +825,15 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int dx,
int y,
int dy) {
int j;
- void (*ScaleARGBCols)(uint8 * dst_argb, const uint8* src_argb, int dst_width,
- int x, int dx) =
+ void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+ int dst_width, int x, int dx) =
(src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
(void)src_height;
#if defined(HAS_SCALEARGBCOLS_SSE2)
@@ -779,6 +849,22 @@
}
}
#endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ ScaleARGBCols = ScaleARGBCols_Any_MSA;
+ if (IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBCols_MSA;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ ScaleARGBCols = ScaleARGBCols_Any_MMI;
+ if (IS_ALIGNED(dst_width, 1)) {
+ ScaleARGBCols = ScaleARGBCols_MMI;
+ }
+ }
+#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -786,6 +872,11 @@
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
+#if defined(HAS_SCALEARGBCOLSUP2_MMI)
+ if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
+ ScaleARGBCols = ScaleARGBColsUp2_MMI;
+ }
+#endif
}
for (j = 0; j < dst_height; ++j) {
@@ -799,11 +890,11 @@
// ScaleARGB scales an ARGB image.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src,
+static void ScaleARGB(const uint8_t* src,
int src_stride,
int src_width,
int src_height,
- uint8* dst,
+ uint8_t* dst,
int dst_stride,
int dst_width,
int dst_height,
@@ -832,13 +923,13 @@
&dx, &dy);
src_width = Abs(src_width);
if (clip_x) {
- int64 clipf = (int64)(clip_x)*dx;
+ int64_t clipf = (int64_t)(clip_x)*dx;
x += (clipf & 0xffff);
src += (clipf >> 16) * 4;
dst += clip_x * 4;
}
if (clip_y) {
- int64 clipf = (int64)(clip_y)*dy;
+ int64_t clipf = (int64_t)(clip_y)*dy;
y += (clipf & 0xffff);
src += (clipf >> 16) * src_stride;
dst += clip_y * dst_stride;
@@ -904,11 +995,11 @@
}
LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb,
+int ARGBScaleClip(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -932,11 +1023,11 @@
// Scale an ARGB image.
LIBYUV_API
-int ARGBScale(const uint8* src_argb,
+int ARGBScale(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -953,18 +1044,18 @@
// Scale with YUV conversion to ARGB and clipping.
LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y,
+int YUVToARGBScaleClip(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint32 src_fourcc,
+ uint32_t src_fourcc,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- uint32 dst_fourcc,
+ uint32_t dst_fourcc,
int dst_width,
int dst_height,
int clip_x,
@@ -972,7 +1063,7 @@
int clip_width,
int clip_height,
enum FilterMode filtering) {
- uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+ uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
int r;
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
diff --git a/files/source/scale_common.cc b/files/source/scale_common.cc
index 1bef39d..6369027 100644
--- a/files/source/scale_common.cc
+++ b/files/source/scale_common.cc
@@ -28,9 +28,9 @@
}
// CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr,
+void ScaleRowDown2_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -45,9 +45,9 @@
}
}
-void ScaleRowDown2_16_C(const uint16* src_ptr,
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -62,11 +62,11 @@
}
}
-void ScaleRowDown2Linear_C(const uint8* src_ptr,
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- const uint8* s = src_ptr;
+ const uint8_t* s = src_ptr;
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
@@ -80,11 +80,11 @@
}
}
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr,
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
- const uint16* s = src_ptr;
+ const uint16_t* s = src_ptr;
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
@@ -98,12 +98,12 @@
}
}
-void ScaleRowDown2Box_C(const uint8* src_ptr,
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -117,12 +117,12 @@
}
}
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr,
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
int x;
dst_width -= 1;
for (x = 0; x < dst_width - 1; x += 2) {
@@ -141,12 +141,12 @@
dst[0] = (s[0] + t[0] + 1) >> 1;
}
-void ScaleRowDown2Box_16_C(const uint16* src_ptr,
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -160,9 +160,9 @@
}
}
-void ScaleRowDown4_C(const uint8* src_ptr,
+void ScaleRowDown4_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -177,9 +177,9 @@
}
}
-void ScaleRowDown4_16_C(const uint16* src_ptr,
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -194,9 +194,9 @@
}
}
-void ScaleRowDown4Box_C(const uint8* src_ptr,
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
@@ -232,9 +232,9 @@
}
}
-void ScaleRowDown4Box_16_C(const uint16* src_ptr,
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
@@ -270,9 +270,9 @@
}
}
-void ScaleRowDown34_C(const uint8* src_ptr,
+void ScaleRowDown34_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -286,9 +286,9 @@
}
}
-void ScaleRowDown34_16_C(const uint16* src_ptr,
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -303,21 +303,21 @@
}
// Filter rows 0 and 1 together, 3 : 1
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* d,
+ uint8_t* d,
int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 * 3 + b0 + 2) >> 2;
d[1] = (a1 * 3 + b1 + 2) >> 2;
d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -327,21 +327,21 @@
}
}
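// Not part of the patch -- a worked example of the 3/4 horizontal taps
// above, assuming source bytes s = {100, 200, 40, 120}:
//   a0 = (100*3 + 200 + 2) >> 2 = 125   // weights 3:1 toward s[0]
//   a1 = (200 + 40 + 1) >> 1   = 120    // midpoint of s[1], s[2]
//   a2 = (40 + 120*3 + 2) >> 2 = 100    // weights 1:3 toward s[3]
// The +2 / +1 terms round to nearest before the shift; the a/b rows are
// then blended 3:1 (rows 0 and 1) or 1:1 (rows 1 and 2) vertically.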
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* d,
+ uint16_t* d,
int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
- uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 * 3 + b0 + 2) >> 2;
d[1] = (a1 * 3 + b1 + 2) >> 2;
d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -352,21 +352,21 @@
}
// Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* d,
+ uint8_t* d,
int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 + b0 + 1) >> 1;
d[1] = (a1 + b1 + 1) >> 1;
d[2] = (a2 + b2 + 1) >> 1;
@@ -376,21 +376,21 @@
}
}
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* d,
+ uint16_t* d,
int dst_width) {
- const uint16* s = src_ptr;
- const uint16* t = src_ptr + src_stride;
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
int x;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (x = 0; x < dst_width; x += 3) {
- uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
d[0] = (a0 + b0 + 1) >> 1;
d[1] = (a1 + b1 + 1) >> 1;
d[2] = (a2 + b2 + 1) >> 1;
@@ -401,8 +401,8 @@
}
// Scales a single row of pixels using point sampling.
-void ScaleCols_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -419,8 +419,8 @@
}
}
-void ScaleCols_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -438,8 +438,8 @@
}
// Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -456,8 +456,8 @@
}
}
-void ScaleColsUp2_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -477,15 +477,15 @@
// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) \
- (uint8)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+ (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) \
- (uint8)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+ (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif
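// Not part of the patch -- a standalone sketch comparing the two blenders
// above for one sample pair, with x16 a 16.16 fixed-point fraction as used
// throughout this file. The ARM form keeps all 16 fraction bits; the Intel
// form truncates to 7 bits (f >> 9) to match pmaddubsw, so the two results
// can differ by at most 1.
static inline int BlendBothWays(int a, int b, int x16) {
  int f = x16 & 0xffff;
  int arm = a + ((f * (b - a) + 0x8000) >> 16);      // full 16-bit fraction
  int x86 = a + (((f >> 9) * (b - a) + 0x40) >> 7);  // 7-bit, rounded
  // e.g. a = 0, b = 255, f = 0x8000 (one half): both return 128.
  return arm - x86;  // 0 in most cases, +/-1 worst case.
}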
-void ScaleFilterCols_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -511,15 +511,15 @@
}
}
-void ScaleFilterCols64_C(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x32,
int dx) {
- int64 x = (int64)(x32);
+ int64_t x = (int64_t)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -532,7 +532,7 @@
dst_ptr += 2;
}
if (dst_width & 1) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -540,12 +540,14 @@
}
#undef BLENDER
-// Same as 8 bit arm blender but return is cast to uint16
+// Same as 8 bit arm blender but return is cast to uint16_t
#define BLENDER(a, b, f) \
- (uint16)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+ (uint16_t)( \
+ (int)(a) + \
+ (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))
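// Not part of the patch -- why the int64_t widening above matters: with
// 16-bit samples, f and (b - a) can each approach 65535, so their product
// can exceed 2^31 and overflow a 32-bit int. Worst case:
//   f = 0xffff, a = 0, b = 0xffff
//   (int64_t)f * (b - a) = 0xfffe0001  (~4.29e9, above INT_MAX ~2.15e9)
//   (0xfffe0001 + 0x8000) >> 16 = 0xfffe  -- the blend stays exact.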
-void ScaleFilterCols_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -571,15 +573,15 @@
}
}
-void ScaleFilterCols64_16_C(uint16* dst_ptr,
- const uint16* src_ptr,
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
int dst_width,
int x32,
int dx) {
- int64 x = (int64)(x32);
+ int64_t x = (int64_t)(x32);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -592,7 +594,7 @@
dst_ptr += 2;
}
if (dst_width & 1) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int a = src_ptr[xi];
int b = src_ptr[xi + 1];
dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -600,9 +602,9 @@
}
#undef BLENDER
-void ScaleRowDown38_C(const uint8* src_ptr,
+void ScaleRowDown38_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -616,9 +618,9 @@
}
}
-void ScaleRowDown38_16_C(const uint16* src_ptr,
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst,
+ uint16_t* dst,
int dst_width) {
int x;
(void)src_stride;
@@ -633,9 +635,9 @@
}
// 8x3 -> 3x1
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
@@ -663,9 +665,9 @@
}
}
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr,
+ uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
@@ -694,9 +696,9 @@
}
// 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
@@ -719,9 +721,9 @@
}
}
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
- uint16* dst_ptr,
+ uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
@@ -744,7 +746,7 @@
}
}
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
@@ -758,7 +760,9 @@
}
}
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width) {
int x;
assert(src_width > 0);
for (x = 0; x < src_width - 1; x += 2) {
@@ -772,12 +776,12 @@
}
}
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int x;
(void)src_stride;
for (x = 0; x < dst_width - 1; x += 2) {
@@ -791,9 +795,9 @@
}
}
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
int x;
(void)src_stride;
@@ -807,9 +811,9 @@
}
}
-void ScaleARGBRowDown2Box_C(const uint8* src_argb,
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
@@ -830,13 +834,13 @@
}
}
-void ScaleARGBRowDownEven_C(const uint8* src_argb,
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
(void)src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
@@ -850,10 +854,10 @@
}
}
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
int x;
for (x = 0; x < dst_width; ++x) {
@@ -875,13 +879,13 @@
}
// Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst[0] = src[x >> 16];
@@ -895,14 +899,14 @@
}
}
-void ScaleARGBCols64_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x32,
int dx) {
- int64 x = (int64)(x32);
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ int64_t x = (int64_t)(x32);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
dst[0] = src[x >> 16];
@@ -917,13 +921,13 @@
}
// Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int j;
(void)x;
(void)dx;
@@ -941,24 +945,24 @@
// Mimics SSSE3 blender
#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
#define BLENDERC(a, b, f, s) \
- (uint32)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+ (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
#define BLENDER(a, b, f) \
BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
BLENDERC(a, b, f, 0)
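// Not part of the patch -- a worked example of the packed-ARGB blender
// above. Each 8-bit channel is blended separately with a 7-bit fraction
// f in [0,127]; 0x7f ^ f equals 127 - f there, so BLENDER1 computes
// (a*(127-f) + b*f) >> 7. With a = 0x00FF0000 (red), b = 0x000000FF
// (blue), f = 64:
//   red:  (255*63 + 0*64) >> 7 = 125
//   blue: (0*63 + 255*64) >> 7 = 127
//   result 0x007D007F -- a near-even purple, matching the SSSE3 path.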
-void ScaleARGBFilterCols_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
int xi = x >> 16;
int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
+ uint32_t a = src[xi];
+ uint32_t b = src[xi + 1];
dst[0] = BLENDER(a, b, xf);
x += dx;
xi = x >> 16;
@@ -972,26 +976,26 @@
if (dst_width & 1) {
int xi = x >> 16;
int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
+ uint32_t a = src[xi];
+ uint32_t b = src[xi + 1];
dst[0] = BLENDER(a, b, xf);
}
}
-void ScaleARGBFilterCols64_C(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x32,
int dx) {
- int64 x = (int64)(x32);
- const uint32* src = (const uint32*)(src_argb);
- uint32* dst = (uint32*)(dst_argb);
+ int64_t x = (int64_t)(x32);
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
int j;
for (j = 0; j < dst_width - 1; j += 2) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
+ uint32_t a = src[xi];
+ uint32_t b = src[xi + 1];
dst[0] = BLENDER(a, b, xf);
x += dx;
xi = x >> 16;
@@ -1003,10 +1007,10 @@
dst += 2;
}
if (dst_width & 1) {
- int64 xi = x >> 16;
+ int64_t xi = x >> 16;
int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
+ uint32_t a = src[xi];
+ uint32_t b = src[xi + 1];
dst[0] = BLENDER(a, b, xf);
}
}
@@ -1020,8 +1024,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint8* src_argb,
- uint8* dst_argb,
+ const uint8_t* src_argb,
+ uint8_t* dst_argb,
int x,
int y,
int dy,
@@ -1029,7 +1033,7 @@
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher bpp.
int dst_width_bytes = dst_width * bpp;
- void (*InterpolateRow)(uint8 * dst_argb, const uint8* src_argb,
+ void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1063,16 +1067,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
- IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_DSPR2;
- if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_DSPR2;
- }
- }
-#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
@@ -1081,6 +1075,14 @@
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ InterpolateRow = InterpolateRow_Any_MMI;
+ if (IS_ALIGNED(dst_width_bytes, 8)) {
+ InterpolateRow = InterpolateRow_MMI;
+ }
+ }
+#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
@@ -1100,8 +1102,8 @@
int dst_height,
int src_stride,
int dst_stride,
- const uint16* src_argb,
- uint16* dst_argb,
+ const uint16_t* src_argb,
+ uint16_t* dst_argb,
int x,
int y,
int dy,
@@ -1109,7 +1111,7 @@
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
- void (*InterpolateRow)(uint16 * dst_argb, const uint16* src_argb,
+ void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_16_C;
const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
@@ -1151,16 +1153,6 @@
}
}
#endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
- if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_argb, 4) &&
- IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_argb, 4) &&
- IS_ALIGNED(dst_stride, 4)) {
- InterpolateRow = InterpolateRow_Any_16_DSPR2;
- if (IS_ALIGNED(dst_width_bytes, 4)) {
- InterpolateRow = InterpolateRow_16_DSPR2;
- }
- }
-#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;
@@ -1222,12 +1214,12 @@
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div) {
- return (int)(((int64)(num) << 16) / div);
+ return (int)(((int64_t)(num) << 16) / div);
}
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div) {
- return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
+ return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
}
#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
@@ -1306,6 +1298,35 @@
}
#undef CENTERSTART
+// Reads an 8x2 block of pixels, upsamples with filtering, and writes 16x1.
+// Actually reads one extra pixel per row, so 9x2.
+void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src2 = src_ptr + src_stride;
+
+ int x;
+ for (x = 0; x < dst_width - 1; x += 2) {
+ uint16_t p0 = src_ptr[0];
+ uint16_t p1 = src_ptr[1];
+ uint16_t p2 = src2[0];
+ uint16_t p3 = src2[1];
+ dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+ dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+ ++src_ptr;
+ ++src2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ uint16_t p0 = src_ptr[0];
+ uint16_t p1 = src_ptr[1];
+ uint16_t p2 = src2[0];
+ uint16_t p3 = src2[1];
+ dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+ }
+}
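// Not part of the patch -- where the 9/3/3/1 weights above come from: 2x
// upsampling evaluates bilinear interpolation at quarter-pixel phases, and
// (3/4)(3/4) = 9/16, (3/4)(1/4) = 3/16 (twice), (1/4)(1/4) = 1/16. The +8
// rounds to nearest before >> 4 divides by 16. For p0=100, p1=200, p2=40,
// p3=120:
//   dst[0] = (900 + 600 + 120 + 120 + 8) >> 4 = 109
//   dst[1] = (300 + 1800 + 40 + 360 + 8) >> 4 = 156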
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_gcc.cc b/files/source/scale_gcc.cc
index f0ac56f..90a49f3 100644
--- a/files/source/scale_gcc.cc
+++ b/files/source/scale_gcc.cc
@@ -21,462 +21,458 @@
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
- 10, 11, 12, 13, 13, 14, 14, 15};
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Coefficients for source bytes 21 to 31
-static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
-static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
- 6, 8, 11, 14, 128, 128, 128, 128};
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
- 6, 7, 12, 13, 128, 128, 128, 128};
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
- 65536 / 9, 65536 / 6, 0, 0};
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
- 11, 128, 14, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
- 12, 128, 15, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
- 13, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
- 65536 / 3, 65536 / 2, 0, 0};
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
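// Not part of the patch -- these constants replace division with a
// reciprocal multiply: avg = (sum * (65536 / n)) >> 16. For example, a
// full 3x3 box summing to 9*200 = 1800 gives (1800 * 7281) >> 16 = 199;
// 65536/9 truncates to 7281, so the result can be one below the exact
// average.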
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
-void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
+ asm volatile(
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
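// Not part of the patch -- a scalar equivalent of the loop above, assuming
// dst_width is a multiple of 16 as the asm requires. psrlw $0x8 keeps the
// odd byte of each 16-bit lane and packuswb narrows back to bytes, i.e.
// point-sampling every second pixel:
static void ScaleRowDown2_Scalar(const uint8_t* src_ptr,
                                 uint8_t* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep odd bytes 1, 3, 5, ...
  }
}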
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "psrlw $0x1,%%xmm0 \n"
- "psrlw $0x1,%%xmm1 \n"
- "pavgw %%xmm5,%%xmm0 \n"
- "pavgw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "psrlw $0x1,%%xmm0 \n"
+ "psrlw $0x1,%%xmm1 \n"
+ "pavgw %%xmm5,%%xmm0 \n"
+ "pavgw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
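// Not part of the patch -- the rounding in the vector path above matches
// ScaleRowDown2Box_C exactly. With xmm4 holding bytes of 1, pmaddubsw sums
// horizontal byte pairs; after adding the second row, the total
// S = s0+s1+t0+t1 goes through psrlw $1 then pavgw against zero:
//   ((S >> 1) + 1) >> 1 == (S + 2) >> 2   for all S >= 0,
// which is the same round-half-up 2x2 box average as the C code.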
#ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr,
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
- "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+ "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
+ asm volatile(
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
}
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psrlw $0xf,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm5 \n"
- "packuswb %%xmm4,%%xmm4 \n"
- "psllw $0x3,%%xmm5 \n"
- "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrlw $0xf,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm4,%%xmm4 \n"
+ "psllw $0x3,%%xmm5 \n"
+ "lea 0x00(%4,%4,2),%3 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "psrlw $0x4,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "=&r"(stridex3) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%4,2),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,2),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "=&r"(stridex3) // %3
+ : "r"((intptr_t)(src_stride)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
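// Not part of the patch -- "lea 0x00(%4,%4,2),%3" above is the classic x86
// multiply-by-three: base + index*2 with both set to src_stride yields
// stridex3 = src_stride * 3, so the loop can address source rows 0..3 as
// (%0), (%0,%4,1), (%0,%4,2) and (%0,%3,1) with no extra arithmetic.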
#ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr,
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
- "vpsrld $0x18,%%ymm5,%%ymm5 \n"
- "vpslld $0x10,%%ymm5,%%ymm5 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpand %%ymm5,%%ymm0,%%ymm0 \n"
- "vpand %%ymm5,%%ymm1,%%ymm1 \n"
- "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm5"
- );
+ asm volatile(
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrld $0x18,%%ymm5,%%ymm5 \n"
+ "vpslld $0x10,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm5");
}
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
- "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
- "vpsllw $0x3,%%ymm4,%%ymm5 \n"
- "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+ asm volatile(
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+ "vpsllw $0x3,%%ymm4,%%ymm5 \n"
+ "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
- "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
- "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
- "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2
- MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
- "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
- "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
- "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
- "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
- "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
- "vpermq $0xd8,%%ymm0,%%ymm0 \n"
- "vmovdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "r"((intptr_t)(src_stride * 3)) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n"
+ "vmovdqu 0x20(%0),%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
+ "vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x4,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(src_stride * 3)) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
@@ -487,34 +483,35 @@
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
- "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
@@ -524,7 +521,7 @@
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
@@ -533,54 +530,54 @@
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
@@ -590,7 +587,7 @@
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
- );
+ );
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
@@ -599,90 +596,89 @@
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
- );
-
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS(1) " \n"
- "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x18,1) ",%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
+
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqu 0x10(%0),%%xmm6 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
-void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ asm volatile(
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
- "lea " MEMLEA(0xc,1) ",%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
@@ -694,41 +690,40 @@
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1," MEMACCESS(1) " \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
- "lea " MEMLEA(0x6,1) ",%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
@@ -739,530 +734,534 @@
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
- );
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movd %%xmm6," MEMACCESS(1) " \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
- "lea " MEMLEA(0x6,1) ",%1 \n"
- "sub $0x6,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqu 0x00(%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "sub $0x6,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
// Reads 16xN bytes and produces 16 shorts at a time.
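// (Editor's note, a sketch from reading the loop below: each call
// accumulates one source row into the 16-bit sum row with unsigned
// saturation via paddusw, i.e. dst_ptr[i] = sat(dst_ptr[i] + src_ptr[i]),
// 16 pixels per iteration.)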
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n"
- "movdqa %%xmm3,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "punpckhbw %%xmm5,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 16 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n" // src_ptr += 16
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu 0x10(%1),%%xmm1 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
- asm volatile (
- "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
- LABELALIGN
- "1: \n"
- "vmovdqu " MEMACCESS(0) ",%%ymm3 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32
- "vpermq $0xd8,%%ymm3,%%ymm3 \n"
- "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
- "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
- "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
- "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
- "vmovdqu %%ymm0," MEMACCESS(1) " \n"
- "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
- "lea " MEMLEA(0x40,1) ",%1 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
- "vzeroupper \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_width) // %2
- :
- : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
- );
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+ LABELALIGN
+ "1: \n"
+ "vmovdqu (%0),%%ymm3 \n"
+ "lea 0x20(%0),%0 \n" // src_ptr += 32
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpaddusw (%1),%%ymm2,%%ymm0 \n"
+ "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
+ "vmovdqu %%ymm0,(%1) \n"
+ "vmovdqu %%ymm1,0x20(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
-static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
- 0x4040, 0x4040, 0x4040, 0x4040};
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
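+// Editor's note on the two constants above (a sketch, assuming the filter
+// weights fed to pmaddubsw always sum to 128): pmaddubsw treats one operand
+// as unsigned and the other as signed, so biasing pixels by 0x80 keeps
+// every product within int16_t range. kFadd40 then undoes the bias and
+// rounds: 128 (bias) * 128 (weight sum) = 0x4000, plus 0x40 for +0.5
+// before the final ">> 7", giving 0x4040 per lane.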
// Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
- asm volatile (
- "movd %6,%%xmm2 \n"
- "movd %7,%%xmm3 \n"
- "movl $0x04040000,%k2 \n"
- "movd %k2,%%xmm5 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n" // 0x007f007f
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $15,%%xmm7 \n" // 0x00010001
+ asm volatile(
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n" // 0x007f007f
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $15,%%xmm7 \n" // 0x00010001
- "pextrw $0x1,%%xmm2,%k3 \n"
- "subl $0x2,%5 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
- "movd %k2,%%xmm4 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpcklwd %%xmm4,%%xmm0 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + 1
- "paddusb %%xmm7,%%xmm1 \n"
- "pmaddubsw %%xmm0,%%xmm1 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "paddw %9,%%xmm1 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movd %%xmm1,%k2 \n"
- "mov %w2," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x2,0) ",%0 \n"
- "subl $0x2,%5 \n"
- "jge 2b \n"
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movzwl 0x00(%1,%4,1),%k2 \n"
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+      "pxor        %%xmm6,%%xmm1             \n"  // 128 - f = (f ^ 127) + 1
+ "paddusb %%xmm7,%%xmm1 \n"
+ "pmaddubsw %%xmm0,%%xmm1 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "paddw %9,%%xmm1 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movd %%xmm1,%k2 \n"
+ "mov %w2,(%0) \n"
+ "lea 0x2(%0),%0 \n"
+ "subl $0x2,%5 \n"
+ "jge 2b \n"
- LABELALIGN
- "29: \n"
- "addl $0x1,%5 \n"
- "jl 99f \n"
- MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
- "movd %k2,%%xmm0 \n"
- "psrlw $0x9,%%xmm2 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "psubb %8,%%xmm0 \n" // make pixels signed.
- "pxor %%xmm6,%%xmm2 \n"
- "paddusb %%xmm7,%%xmm2 \n"
- "pmaddubsw %%xmm0,%%xmm2 \n"
- "paddw %9,%%xmm2 \n" // make pixels unsigned.
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm2 \n"
- "movd %%xmm2,%k2 \n"
- "mov %b2," MEMACCESS(0) " \n"
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "=&a"(temp_pixel), // %2
- "=&r"(x0), // %3
- "=&r"(x1), // %4
+ LABELALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ "movzwl 0x00(%1,%3,1),%k2 \n"
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "psubb %8,%%xmm0 \n" // make pixels signed.
+ "pxor %%xmm6,%%xmm2 \n"
+ "paddusb %%xmm7,%%xmm2 \n"
+ "pmaddubsw %%xmm0,%%xmm2 \n"
+ "paddw %9,%%xmm2 \n" // make pixels unsigned.
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movd %%xmm2,%k2 \n"
+ "mov %b2,(%0) \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "=&a"(temp_pixel), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1), // %4
#if defined(__x86_64__)
- "+rm"(dst_width) // %5
+ "+rm"(dst_width) // %5
#else
- "+m"(dst_width) // %5
+ "+m"(dst_width) // %5
#endif
- : "rm"(x), // %6
- "rm"(dx), // %7
+ : "rm"(x), // %6
+ "rm"(dx), // %7
#if defined(__x86_64__)
- "x"(kFsub80), // %8
- "x"(kFadd40) // %9
+ "x"(kFsub80), // %8
+ "x"(kFadd40) // %9
#else
- "m"(kFsub80), // %8
- "m"(kFadd40) // %9
+ "m"(kFsub80), // %8
+ "m"(kFadd40) // %9
#endif
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- );
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+ "xmm7");
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
(void)x;
(void)dx;
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(0) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "sub $0x20,%2 \n"
- "jg 1b \n"
+ asm volatile(
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", "xmm0", "xmm1"
- );
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3"
- );
+ asm volatile(
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x00(%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((intptr_t)(src_stride)) // %3
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
- asm volatile (
- "lea " MEMLEA3(0x00,1,4) ",%1 \n"
- "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
- LABELALIGN
- "1: \n"
- "movd " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
- "punpckldq %%xmm1,%%xmm0 \n"
- MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
- MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
- "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "=&r"(src_stepx_x12) // %4
- :: "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3"
- );
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd 0x00(%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%0,%1,2),%%xmm2 \n"
+ "movd 0x00(%0,%4,1),%%xmm3 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "=&r"(src_stepx_x12) // %4
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
- asm volatile (
- "lea " MEMLEA3(0x00,1,4) ",%1 \n"
- "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
- "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
+ asm volatile(
+ "lea 0x00(,%1,4),%1 \n"
+ "lea 0x00(%1,%1,2),%4 \n"
+ "lea 0x00(%0,%5,1),%5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
- MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
- MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
- "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
- "movq " MEMACCESS(5) ",%%xmm2 \n"
- MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
- MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
- MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
- "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%3 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "=&r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- :: "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3"
- );
+ LABELALIGN
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps 0x00(%0,%1,1),%%xmm0 \n"
+ "movq 0x00(%0,%1,2),%%xmm1 \n"
+ "movhps 0x00(%0,%4,1),%%xmm1 \n"
+ "lea 0x00(%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps 0x00(%5,%1,1),%%xmm2 \n"
+ "movq 0x00(%5,%1,2),%%xmm3 \n"
+ "movhps 0x00(%5,%4,1),%%xmm3 \n"
+ "lea 0x00(%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "=&r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
-void ScaleARGBCols_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
intptr_t x0, x1;
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x11,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x5,%%xmm3,%%xmm0 \n"
- "paddd %%xmm0,%%xmm2 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "cmp $0x0,%4 \n"
- "jl 99f \n"
- "sub $0x4,%4 \n"
- "jl 49f \n"
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
- LABELALIGN
- "40: \n"
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
- "pextrw $0x5,%%xmm2,%k0 \n"
- "pextrw $0x7,%%xmm2,%k1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
- MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
- "pextrw $0x1,%%xmm2,%k0 \n"
- "pextrw $0x3,%%xmm2,%k1 \n"
- "punpckldq %%xmm4,%%xmm1 \n"
- "punpcklqdq %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "sub $0x4,%4 \n"
- "jge 40b \n"
+ LABELALIGN
+ "40: \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd 0x00(%3,%0,4),%%xmm1 \n"
+ "movd 0x00(%3,%1,4),%%xmm4 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%4 \n"
+ "jge 40b \n"
- "49: \n"
- "test $0x2,%4 \n"
- "je 29f \n"
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
- "pextrw $0x5,%%xmm2,%k0 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x8,2) ",%2 \n"
- "29: \n"
- "test $0x1,%4 \n"
- "je 99f \n"
- MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
- "movd %%xmm0," MEMACCESS(2) " \n"
- "99: \n"
- : "=&a"(x0), // %0
- "=&d"(x1), // %1
- "+r"(dst_argb), // %2
- "+r"(src_argb), // %3
- "+r"(dst_width) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
- );
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd 0x00(%3,%1,4),%%xmm1 \n"
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%2) \n"
+ "lea 0x8(%2),%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ "movd 0x00(%3,%0,4),%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "99: \n"
+ : "=&a"(x0), // %0
+ "=&d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
(void)x;
(void)dx;
- asm volatile (
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpckldq %%xmm0,%%xmm0 \n"
- "punpckhdq %%xmm1,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS(0) " \n"
- "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
+ asm volatile(
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width) // %2
- :: "memory", "cc", NACL_R14
- "xmm0", "xmm1"
- );
+ LABELALIGN
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "movdqu %%xmm0,(%0) \n"
+ "movdqu %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1");
}
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
+static const uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
+static const uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
@@ -1273,69 +1272,67 @@
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
- );
-
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- LABELALIGN
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
- "psrlw $0x9,%%xmm1 \n"
- MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0," MEMACCESS(0) " \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- LABELALIGN
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0," MEMACCESS(0) " \n"
-
- LABELALIGN
- "99: \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "=&r"(x0), // %3
- "=&r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc", NACL_R14
- "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
+
+ asm volatile(
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ LABELALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps 0x00(%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ LABELALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq 0x00(%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+
+      LABELALIGN "99: \n"  // kept on one line to appease clang-format.
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "=&r"(x0), // %3
+ "=&r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
// Divide num by div and return as 16.16 fixed point result.
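// (Editor's sketch of the 16.16 convention: dividing 3 by 2 yields
// (3 << 16) / 2 = 0x18000, i.e. 1.5 scaled by 65536.)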
diff --git a/files/source/scale_mmi.cc b/files/source/scale_mmi.cc
new file mode 100644
index 0000000..990463c
--- /dev/null
+++ b/files/source/scale_mmi.cc
@@ -0,0 +1,1113 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for MIPS MMI (Loongson-3A).
+#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+// clang-format off
+
+// CPU agnostic row functions
+void ScaleRowDown2_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x8ULL;
+
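+  // Editor's note: psrlh by 8 leaves the odd byte of each 16-bit pixel
+  // pair in the low byte of its lane, and packushb packs the low bytes of
+  // both vectors, so dst[i] = src[2 * i + 1], 8 pixels per iteration.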
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+
+ "packushb %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift = 0x8ULL;
+
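+  // Editor's note: the mask keeps the even-indexed pixels, the shift
+  // extracts the odd-indexed ones, and pavgb averages them with rounding:
+  // dst[i] = (src[2 * i] + src[2 * i + 1] + 1) >> 1.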
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "and %[dest0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "and %[dest1], %[src1], %[mask] \n\t"
+ "packushb %[dest0], %[dest0], %[dest1] \n\t"
+
+ "psrlh %[src0], %[src0], %[shift] \n\t"
+ "psrlh %[src1], %[src1], %[shift] \n\t"
+ "packushb %[dest1], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0),
+ [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask),
+ [shift] "f"(shift), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, t0, t1;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t mask = 0x00ff00ff00ff00ffULL;
+ const uint64_t shift0 = 0x2ULL;
+ const uint64_t shift1 = 0x8ULL;
+
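+  // Editor's note: this is a rounded 2x2 box filter. The mask/shift1 pair
+  // separates even and odd columns of rows s and t, the paddh chain forms
+  // the 2x2 sum, and ph (+2) with the final ">> 2" compute
+  // dst[i] = (s[2i] + s[2i+1] + t[2i] + t[2i+1] + 2) / 4.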
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest0], %[s0], %[s1] \n\t"
+ "paddh %[dest0], %[dest0], %[t0] \n\t"
+ "paddh %[dest0], %[dest0], %[t1] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift0] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlh %[s1], %[s0], %[shift1] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlh %[t1], %[t0], %[shift1] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddh %[dest1], %[s0], %[s1] \n\t"
+ "paddh %[dest1], %[dest1], %[t0] \n\t"
+ "paddh %[dest1], %[dest1], %[t1] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift0] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleARGBRowDown2_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ uint64_t src0, src1, dest;
+
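+  // Editor's note: each 64-bit load covers two ARGB pixels; punpckhwd
+  // keeps the high 32-bit word of src0 and src1, so the output is pixels
+  // 1 and 3 of every group of 4, i.e. dst[i] = src[2 * i + 1].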
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_lo], %[src0], %[src1] \n\t"
+ "lwc1 %[src0], 0x04(%[src_ptr]) \n\t"
+ "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t"
+ "punpcklwd %[dest_hi], %[src0], %[src1] \n\t"
+
+ "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* s = src_argb;
+ const uint8_t* t = src_argb + src_stride;
+
+ uint64_t s0, s_hi, s_lo;
+ uint64_t t0, t_hi, t_lo;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+  const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t"
+
+ "paddh %[dest_lo], %[dest_lo], %[ph] \n\t"
+      "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "punpcklbh %[s_lo], %[s0], %[mask] \n\t"
+ "punpckhbh %[s_hi], %[s0], %[mask] \n\t"
+ "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "punpcklbh %[t_lo], %[t0], %[mask] \n\t"
+ "punpckhbh %[t_hi], %[t0], %[mask] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t"
+
+ "paddh %[dest_hi], %[dest_hi], %[ph] \n\t"
+      "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo),
+ [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+      [mask] "f"(mask), [ph] "f"(ph), [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+ const uint64_t shift = 0x10ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+
+ "packsswh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift)
+ : "memory");
+}
+
+void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpcklhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+
+ "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t"
+ "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t"
+
+ "pavgh %[dest], %[src0], %[src1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* s = src_ptr;
+ const uint16_t* t = src_ptr + src_stride;
+
+ uint64_t s0, s1, s_hi, s_lo;
+ uint64_t t0, t1, t_hi, t_lo;
+ uint64_t dest, dest0, dest1;
+
+ const uint64_t ph = 0x0000000200000002ULL;
+ const uint64_t mask = 0x0000ffff0000ffffULL;
+ const uint64_t shift0 = 0x10ULL;
+ const uint64_t shift1 = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[s0], 0x00(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x07(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x00(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x07(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest0], %[s0], %[s1] \n\t"
+ "paddw %[dest0], %[dest0], %[t0] \n\t"
+ "paddw %[dest0], %[dest0], %[t1] \n\t"
+ "paddw %[dest0], %[dest0], %[ph] \n\t"
+ "psrlw %[dest0], %[dest0], %[shift1] \n\t"
+
+ "gsldrc1 %[s0], 0x08(%[s]) \n\t"
+ "gsldlc1 %[s0], 0x0f(%[s]) \n\t"
+ "psrlw %[s1], %[s0], %[shift0] \n\t"
+ "and %[s0], %[s0], %[mask] \n\t"
+
+ "gsldrc1 %[t0], 0x08(%[t]) \n\t"
+ "gsldlc1 %[t0], 0x0f(%[t]) \n\t"
+ "psrlw %[t1], %[t0], %[shift0] \n\t"
+ "and %[t0], %[t0], %[mask] \n\t"
+
+ "paddw %[dest1], %[s0], %[s1] \n\t"
+ "paddw %[dest1], %[dest1], %[t0] \n\t"
+ "paddw %[dest1], %[dest1], %[t1] \n\t"
+ "paddw %[dest1], %[dest1], %[ph] \n\t"
+ "psrlw %[dest1], %[dest1], %[shift1] \n\t"
+
+ "packsswh %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[s], %[s], 0x10 \n\t"
+ "daddiu %[t], %[t], 0x10 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1),
+ [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi),
+ [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
+ [dest] "=&f"(dest)
+ : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t shift = 0x10ULL;
+ const uint64_t mask = 0x000000ff000000ffULL;
+
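+  // Editor's note: within each 32-bit group of 4 pixels, ">> 16" plus the
+  // byte mask selects pixel 2; two rounds of packing then produce
+  // dst[i] = src[4 * i + 2], matching the C reference ScaleRowDown4_C.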
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_lo], %[src0], %[src1] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "psrlw %[src0], %[src0], %[shift] \n\t"
+ "and %[src0], %[src0], %[mask] \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "psrlw %[src1], %[src1], %[shift] \n\t"
+ "and %[src1], %[src1], %[mask] \n\t"
+ "packsswh %[dest_hi], %[src0], %[src1] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [shift] "f"(shift), [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleRowDown4_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest_hi, dest_lo;
+
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_lo], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t"
+
+ "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t"
+ "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t"
+ "punpckhhw %[dest_hi], %[src0], %[src1] \n\t"
+ "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi),
+ [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+#define DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_PUNPCKADD() \
+ \
+ "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \
+ "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \
+ "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \
+ "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \
+ "paddh " #reg ", " #reg ", %[ph] \n\t" \
+ "psrlh " #reg ", " #reg ", %[shift] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
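+// Editor's sketch of the reduction above: the four ldc1/punpck/paddh rounds
+// build 16-bit column sums of the four rows; pmaddhw with mask1 (all
+// 0x0001) adds adjacent column pairs, packsswh plus a second pmaddhw adds
+// the pairs of pairs, and ph (+8) with ">> 4" round the 4x4 block sum to
+// its average: dst[i] = (sum of the 4x4 block + 8) >> 4.
+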
+/* LibYUVScaleTest.ScaleDownBy4_Box */
+void ScaleRowDown4Box_MMI(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_ptr;
+ const uint8_t* src1_ptr = src_ptr + src_stride;
+ const uint8_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint8_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x0001000100010001ULL;
+ const uint64_t ph = 0x0008000800080008ULL;
+ const uint64_t shift = 0x4ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_LOOP(%[dest3])
+
+ "packsswh %[dest_lo], %[dest0], %[dest1] \n\t"
+ "packsswh %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
+
+#define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+
+#define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \
+ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \
+ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \
+ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \
+ \
+ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \
+ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \
+ \
+ "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \
+ "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \
+ "paddw %[dest], %[dest_hi], %[dest] \n\t" \
+ "paddw %[dest], %[dest], %[ph] \n\t" \
+ "psraw %[dest], %[dest], %[shift] \n\t" \
+ "and " #reg ", %[dest], %[mask1] \n\t" \
+ \
+ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \
+ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t"
+
+/* LibYUVScaleTest.ScaleDownBy4_Box_16 */
+void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src0_ptr = src_ptr;
+ const uint16_t* src1_ptr = src_ptr + src_stride;
+ const uint16_t* src2_ptr = src_ptr + src_stride * 2;
+ const uint16_t* src3_ptr = src_ptr + src_stride * 3;
+
+ uint64_t src, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3;
+
+ const uint64_t mask0 = 0x0ULL;
+ const uint64_t mask1 = 0x00000000ffffffffULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 0x04ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2])
+ DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3])
+ "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t"
+ "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t"
+
+ "packushb %[dest], %[dest_lo], %[dest_hi] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
+ [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0),
+ [ph] "f"(ph), [mask1] "f"(mask1)
+ : "memory");
+}
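+
+// For reference, a plain-C sketch of what the loop above computes; the
+// helper name is hypothetical and this is an illustration, not the MMI
+// fast path. Each output pixel is the rounded mean of a 4x4 input block.
+static void ScaleRowDown4Box_16_Sketch(const uint16_t* src_ptr,
+                                       ptrdiff_t src_stride,
+                                       uint16_t* dst,
+                                       int dst_width) {
+  int x, i, j;
+  for (x = 0; x < dst_width; ++x) {
+    uint32_t sum = 0;
+    for (j = 0; j < 4; ++j) {
+      for (i = 0; i < 4; ++i) {
+        sum += src_ptr[j * src_stride + x * 4 + i];
+      }
+    }
+    dst[x] = (uint16_t)((sum + 8) >> 4);  // +8 is the ph rounding bias.
+  }
+}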
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_MMI(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src], 0x00(%[src_ptr]) \n\t"
+
+ "punpcklbh %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
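+
+// For reference, a plain-C sketch of the point-sampled 2x upscale above
+// (hypothetical helper, illustration only): punpcklbh of a register with
+// itself simply writes every source byte twice.
+static void ScaleColsUp2_Sketch(uint8_t* dst_ptr,
+                                const uint8_t* src_ptr,
+                                int dst_width) {
+  int x;
+  for (x = 0; x < dst_width / 2; ++x) {
+    dst_ptr[2 * x + 0] = src_ptr[x];
+    dst_ptr[2 * x + 1] = src_ptr[x];
+  }
+}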
+
+void ScaleColsUp2_16_MMI(uint16_t* dst_ptr,
+ const uint16_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest;
+
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+
+ "punpcklhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "punpckhhw %[dest], %[src], %[src] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src] "=&f"(src), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src], %[mask] \n\t"
+ "punpckhbh %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddush %[dest0], %[dest0], %[src_lo] \n\t"
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddush %[dest1], %[dest1], %[src_hi] \n\t"
+
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleAddRow_16_MMI(const uint16_t* src_ptr,
+ uint32_t* dst_ptr,
+ int src_width) {
+ uint64_t src, src_hi, src_lo, dest0, dest1;
+ const uint64_t mask = 0x0ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklhw %[src_lo], %[src], %[mask] \n\t"
+ "punpckhhw %[src_hi], %[src], %[mask] \n\t"
+
+ "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "paddw %[dest0], %[dest0], %[src_lo] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+
+ "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+ "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "paddw %[dest1], %[dest1], %[src_hi] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi),
+ [src_lo] "=&f"(src_lo), [src] "=&f"(src)
+ : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width),
+ [mask] "f"(mask)
+ : "memory");
+}
+
+void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ (void)src_stride;
+
+ uint64_t src0, src1, dest;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "lwc1 %[src0], 0x00(%[src_ptr]) \n\t"
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "lwc1 %[src1], 0x00(%[src_ptr]) \n\t"
+ "punpcklwd %[dest], %[src0], %[src1] \n\t"
+
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb),
+ [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width)
+ : "memory");
+}
+
+void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8_t* dst_argb,
+ int dst_width) {
+ const uint8_t* src0_ptr = src_argb;
+ const uint8_t* src1_ptr = src_argb + src_stride;
+
+ uint64_t src0, src1, src_hi, src_lo;
+ uint64_t dest, dest_hi, dest_lo, dest0, dest1;
+
+ const uint64_t mask = 0x0ULL;
+ const uint64_t ph = 0x0002000200020002ULL;
+ const uint64_t shift = 0x2ULL;
+
+ __asm__ volatile(
+ "1: \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest0], %[dest0], %[ph] \n\t"
+ "psrlh %[dest0], %[dest0], %[shift] \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+
+ "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_lo], %[src0], %[mask] \n\t"
+ "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t"
+ "punpcklbh %[dest_hi], %[src0], %[mask] \n\t"
+
+ "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_lo], %[src1], %[mask] \n\t"
+ "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t"
+ "punpcklbh %[src_hi], %[src1], %[mask] \n\t"
+ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t"
+ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t"
+ "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t"
+ "paddh %[dest1], %[dest1], %[ph] \n\t"
+ "psrlh %[dest1], %[dest1], %[shift] \n\t"
+
+ "packushb %[dest], %[dest0], %[dest1] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t"
+ "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t"
+ "daddi %[width], %[width], -0x02 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
+ [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
+ [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0),
+ [src1] "=&f"(src1), [dest] "=&f"(dest)
+ : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr),
+ [dst_ptr] "r"(dst_argb), [width] "r"(dst_width),
+ [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask),
+ [ph] "f"(ph)
+ : "memory");
+}
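+
+// For reference, a plain-C sketch of the stepped 2x2 ARGB box average above
+// (hypothetical helper, illustration only): each output pixel is the
+// rounded per-channel mean of a 2x2 block, advancing src_stepx pixels
+// between blocks.
+static void ScaleARGBRowDownEvenBox_Sketch(const uint8_t* src_argb,
+                                           ptrdiff_t src_stride,
+                                           int src_stepx,
+                                           uint8_t* dst_argb,
+                                           int dst_width) {
+  int x, c;
+  for (x = 0; x < dst_width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      dst_argb[4 * x + c] = (uint8_t)((src_argb[c] + src_argb[4 + c] +
+                                       src_argb[src_stride + c] +
+                                       src_argb[src_stride + 4 + c] + 2) >>
+                                      2);  // +2 is the ph rounding bias.
+    }
+    src_argb += src_stepx * 4;
+  }
+}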
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+
+ const uint32_t* src_tmp;
+
+ uint64_t dest, offset;
+
+ const uint64_t shift0 = 16;
+ const uint64_t shift1 = 2;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "srav %[offset], %[x], %[shift0] \n\t"
+ "sllv %[offset], %[offset], %[shift1] \n\t"
+ "dadd %[src_tmp], %[src_ptr], %[offset] \n\t"
+ "lwc1 %[dest], 0x00(%[src_tmp]) \n\t"
+ "swc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ "dadd %[x], %[x], %[dx] \n\t"
+
+ "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t"
+ "daddi %[width], %[width], -0x01 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp)
+ : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width),
+ [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1)
+ : "memory");
+}
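+
+// For reference, the 16.16 fixed-point stepping the loop above performs,
+// in plain C (hypothetical helper, illustration only): the integer part
+// of x selects the source pixel and dx advances the position.
+static void ScaleARGBCols_Sketch(uint32_t* dst,
+                                 const uint32_t* src,
+                                 int dst_width,
+                                 int x,
+                                 int dx) {
+  int j;
+  for (j = 0; j < dst_width; ++j) {
+    dst[j] = src[x >> 16];
+    x += dx;
+  }
+}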
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_MMI(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ uint64_t src, dest0, dest1;
+ (void)x;
+ (void)dx;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t"
+ "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t"
+ "punpcklwd %[dest0], %[src], %[src] \n\t"
+ "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t"
+ "punpckhwd %[dest1], %[src], %[src] \n\t"
+ "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x04 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src)
+ : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width)
+ : "memory");
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVBaseTest.TestFixedDiv */
+int FixedDiv_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+      "ddiv      %[num],    %[div]                     \n\t"
+      "mflo      %[quo]                                \n\t"
+      // num is shifted in place, so it must be a read-write operand.
+      : [quo] "+&r"(quotient), [num] "+&r"(num)
+      : [div] "r"(div), [shift] "r"(shift));
+
+ return quotient;
+}
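+
+// For reference, the same computation in plain C; this matches the generic
+// FixedDiv_C path, so the MIPS version above can be checked against it.
+static int FixedDiv_Sketch(int num, int div) {
+  return (int)(((int64_t)(num) << 16) / div);
+}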
+
+// Divide num by div and return as 16.16 fixed point result.
+/* LibYUVScaleTest.ARGBScaleTo320x240_Linear */
+int FixedDiv1_MIPS(int num, int div) {
+ int quotient = 0;
+ const int shift = 16;
+ const int val1 = 1;
+ const int64_t val11 = 0x00010001ULL;
+
+ asm(
+ "dsll %[num], %[num], %[shift] \n\t"
+ "dsub %[num], %[num], %[val11] \n\t"
+ "dsub %[div], %[div], %[val1] \n\t"
+      "ddiv      %[num],    %[div]                     \n\t"
+      "mflo      %[quo]                                \n\t"
+      // num and div are modified in place, so they are read-write operands.
+      : [quo] "+&r"(quotient), [num] "+&r"(num), [div] "+&r"(div)
+      : [val1] "r"(val1), [val11] "r"(val11), [shift] "r"(shift));
+
+ return quotient;
+}
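+
+// For reference, the same computation in plain C (matching the generic
+// FixedDiv1_C path): numerator and denominator are biased so the last
+// source pixel maps exactly onto the last destination pixel.
+static int FixedDiv1_Sketch(int num, int div) {
+  return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
+}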
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads one extra pixel per row, so 9x2.
+void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ const uint16_t* src2_ptr = src_ptr + src_stride;
+
+ uint64_t src0, src1;
+ uint64_t dest, dest04, dest15, dest26, dest37;
+ uint64_t tmp0, tmp1, tmp2, tmp3;
+
+ const uint64_t mask0 = 0x0003000900030009ULL;
+ const uint64_t mask1 = 0x0001000300010003ULL;
+ const uint64_t mask2 = 0x0009000300090003ULL;
+ const uint64_t mask3 = 0x0003000100030001ULL;
+ const uint64_t ph = 0x0000000800000008ULL;
+ const uint64_t shift = 4;
+
+ __asm__ volatile(
+ "1: \n\t"
+ "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest04], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest04], %[dest04], %[dest] \n\t"
+ "paddw %[dest04], %[dest04], %[ph] \n\t"
+ "psrlw %[dest04], %[dest04], %[shift] \n\t"
+
+ "pmaddhw %[dest15], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest15], %[dest15], %[dest] \n\t"
+ "paddw %[dest15], %[dest15], %[ph] \n\t"
+ "psrlw %[dest15], %[dest15], %[shift] \n\t"
+
+ "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t"
+ "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t"
+ "pmaddhw %[dest26], %[src0], %[mask0] \n\t"
+ "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t"
+ "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t"
+ "pmaddhw %[dest], %[src1], %[mask1] \n\t"
+ "paddw %[dest26], %[dest26], %[dest] \n\t"
+ "paddw %[dest26], %[dest26], %[ph] \n\t"
+ "psrlw %[dest26], %[dest26], %[shift] \n\t"
+
+ "pmaddhw %[dest37], %[src0], %[mask2] \n\t"
+ "pmaddhw %[dest], %[src1], %[mask3] \n\t"
+ "paddw %[dest37], %[dest37], %[dest] \n\t"
+ "paddw %[dest37], %[dest37], %[ph] \n\t"
+ "psrlw %[dest37], %[dest37], %[shift] \n\t"
+
+ /* tmp0 = ( 00 04 02 06 ) */
+ "packsswh %[tmp0], %[dest04], %[dest26] \n\t"
+ /* tmp1 = ( 01 05 03 07 ) */
+ "packsswh %[tmp1], %[dest15], %[dest37] \n\t"
+
+ /* tmp2 = ( 00 01 04 05 )*/
+ "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t"
+ /* tmp3 = ( 02 03 06 07 )*/
+ "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t"
+
+ /* ( 00 01 02 03 ) */
+ "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t"
+
+ /* ( 04 05 06 07 ) */
+ "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t"
+ "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t"
+ "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t"
+
+ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t"
+ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t"
+ "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t"
+ "daddi %[width], %[width], -0x08 \n\t"
+ "bnez %[width], 1b \n\t"
+ : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04),
+ [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37),
+ [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2),
+ [tmp3] "=&f"(tmp3), [dest] "=&f"(dest)
+ : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst),
+ [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1),
+ [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph)
+ : "memory");
+}
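+
+// For reference, a plain-C sketch of the 9:3:3:1 bilinear kernel that the
+// pmaddhw masks above encode (hypothetical helper, illustration only).
+// s is the current row, t the next row; each source pixel yields two
+// filtered outputs.
+static void ScaleRowUp2_16_Sketch(const uint16_t* s,
+                                  const uint16_t* t,
+                                  uint16_t* dst,
+                                  int dst_width) {
+  int i;
+  for (i = 0; i < dst_width / 2; ++i) {
+    dst[2 * i + 0] =
+        (uint16_t)((9 * s[i] + 3 * s[i + 1] + 3 * t[i] + t[i + 1] + 8) >> 4);
+    dst[2 * i + 1] =
+        (uint16_t)((3 * s[i] + 9 * s[i + 1] + t[i] + 3 * t[i + 1] + 8) >> 4);
+  }
+}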
+
+// clang-format on
+
+#endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/files/source/scale_msa.cc b/files/source/scale_msa.cc
index bfcd10f..482a521 100644
--- a/files/source/scale_msa.cc
+++ b/files/source/scale_msa.cc
@@ -21,6 +21,14 @@
extern "C" {
#endif
+#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
+ { \
+ out0[0] = srcp[indx0[0]]; \
+ out0[1] = srcp[indx0[1]]; \
+ out0[2] = srcp[indx0[2]]; \
+ out0[3] = srcp[indx0[3]]; \
+ }
+
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
@@ -119,13 +127,13 @@
}
}
-void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
int x;
- const uint8* nxt_argb = src_argb + src_stride;
+ const uint8_t* nxt_argb = src_argb + src_stride;
int32_t stepx = src_stepx * 4;
int64_t data0, data1, data2, data3;
v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
@@ -545,6 +553,394 @@
}
}
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
+ int dst_width,
+ int x,
+ int dx) {
+ int j;
+ v4i32 vec_x = __msa_fill_w(x);
+ v4i32 vec_dx = __msa_fill_w(dx);
+ v4i32 vec_const = {0, 1, 2, 3};
+ v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 reg0, reg1;
+ v16u8 dst0;
+ v4i32 const_0xFFFF = __msa_fill_w(0xFFFF);
+ v4i32 const_0x40 = __msa_fill_w(0x40);
+
+ vec0 = vec_dx * vec_const;
+ vec1 = vec_dx * 4;
+ vec_x += vec0;
+
+ for (j = 0; j < dst_width - 1; j += 16) {
+ vec2 = vec_x >> 16;
+ vec6 = vec_x & const_0xFFFF;
+ vec_x += vec1;
+ vec3 = vec_x >> 16;
+ vec7 = vec_x & const_0xFFFF;
+ vec_x += vec1;
+ vec4 = vec_x >> 16;
+ vec8 = vec_x & const_0xFFFF;
+ vec_x += vec1;
+ vec5 = vec_x >> 16;
+ vec9 = vec_x & const_0xFFFF;
+ vec_x += vec1;
+ vec6 >>= 9;
+ vec7 >>= 9;
+ vec8 >>= 9;
+ vec9 >>= 9;
+ LOAD_INDEXED_DATA(src_ptr, vec2, tmp0);
+ LOAD_INDEXED_DATA(src_ptr, vec3, tmp1);
+ LOAD_INDEXED_DATA(src_ptr, vec4, tmp2);
+ LOAD_INDEXED_DATA(src_ptr, vec5, tmp3);
+ vec2 += 1;
+ vec3 += 1;
+ vec4 += 1;
+ vec5 += 1;
+ LOAD_INDEXED_DATA(src_ptr, vec2, tmp4);
+ LOAD_INDEXED_DATA(src_ptr, vec3, tmp5);
+ LOAD_INDEXED_DATA(src_ptr, vec4, tmp6);
+ LOAD_INDEXED_DATA(src_ptr, vec5, tmp7);
+ tmp4 -= tmp0;
+ tmp5 -= tmp1;
+ tmp6 -= tmp2;
+ tmp7 -= tmp3;
+ tmp4 *= vec6;
+ tmp5 *= vec7;
+ tmp6 *= vec8;
+ tmp7 *= vec9;
+ tmp4 += const_0x40;
+ tmp5 += const_0x40;
+ tmp6 += const_0x40;
+ tmp7 += const_0x40;
+ tmp4 >>= 7;
+ tmp5 >>= 7;
+ tmp6 >>= 7;
+ tmp7 >>= 7;
+ tmp0 += tmp4;
+ tmp1 += tmp5;
+ tmp2 += tmp6;
+ tmp3 += tmp7;
+ reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+ reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ __msa_st_b(dst0, dst_ptr, 0);
+ dst_ptr += 16;
+ }
+}
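+
+// For reference, a plain-C sketch of the per-pixel blend vectorized above
+// (hypothetical helper, illustration only): a 7-bit fraction of x
+// interpolates between each source pixel and its right neighbour, so the
+// last iteration may read one pixel past the nominal width, as the vector
+// path does.
+static void ScaleFilterCols_Sketch(uint8_t* dst_ptr,
+                                   const uint8_t* src_ptr,
+                                   int dst_width,
+                                   int x,
+                                   int dx) {
+  int j;
+  for (j = 0; j < dst_width; ++j) {
+    int xi = x >> 16;           // integer source position
+    int f = (x & 0xFFFF) >> 9;  // 7-bit fraction
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[j] = (uint8_t)(a + (((b - a) * f + 0x40) >> 7));
+    x += dx;
+  }
+}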
+
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ uint32_t* dst = (uint32_t*)(dst_argb);
+ int j;
+ v4i32 x_vec = __msa_fill_w(x);
+ v4i32 dx_vec = __msa_fill_w(dx);
+ v4i32 const_vec = {0, 1, 2, 3};
+ v4i32 vec0, vec1, vec2;
+ v4i32 dst0;
+
+ vec0 = dx_vec * const_vec;
+ vec1 = dx_vec * 4;
+ x_vec += vec0;
+
+ for (j = 0; j < dst_width; j += 4) {
+ vec2 = x_vec >> 16;
+ x_vec += vec1;
+ LOAD_INDEXED_DATA(src, vec2, dst0);
+ __msa_st_w(dst0, dst, 0);
+ dst += 4;
+ }
+}
+
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+ const uint8_t* src_argb,
+ int dst_width,
+ int x,
+ int dx) {
+ const uint32_t* src = (const uint32_t*)(src_argb);
+ int j;
+ v4u32 src0, src1, src2, src3;
+ v4u32 vec0, vec1, vec2, vec3;
+ v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v16u8 mult0, mult1, mult2, mult3;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v16u8 dst0, dst1;
+ v4u32 vec_x = (v4u32)__msa_fill_w(x);
+ v4u32 vec_dx = (v4u32)__msa_fill_w(dx);
+ v4u32 vec_const = {0, 1, 2, 3};
+ v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f);
+
+ vec0 = vec_dx * vec_const;
+ vec1 = vec_dx * 4;
+ vec_x += vec0;
+
+ for (j = 0; j < dst_width - 1; j += 8) {
+ vec2 = vec_x >> 16;
+ reg0 = (v16u8)(vec_x >> 9);
+ vec_x += vec1;
+ vec3 = vec_x >> 16;
+ reg1 = (v16u8)(vec_x >> 9);
+ vec_x += vec1;
+ reg0 = reg0 & const_0x7f;
+ reg1 = reg1 & const_0x7f;
+ reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0);
+ reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0);
+ reg2 = reg0 ^ const_0x7f;
+ reg3 = reg1 ^ const_0x7f;
+ mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2);
+ mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2);
+ mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3);
+ mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3);
+ LOAD_INDEXED_DATA(src, vec2, src0);
+ LOAD_INDEXED_DATA(src, vec3, src1);
+ vec2 += 1;
+ vec3 += 1;
+ LOAD_INDEXED_DATA(src, vec2, src2);
+ LOAD_INDEXED_DATA(src, vec3, src3);
+ reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+ reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+ reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+ reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+ tmp0 = __msa_dotp_u_h(reg4, mult0);
+ tmp1 = __msa_dotp_u_h(reg5, mult1);
+ tmp2 = __msa_dotp_u_h(reg6, mult2);
+ tmp3 = __msa_dotp_u_h(reg7, mult3);
+ tmp0 >>= 7;
+ tmp1 >>= 7;
+ tmp2 >>= 7;
+ tmp3 >>= 7;
+ dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+ __msa_st_b(dst0, dst_argb, 0);
+ __msa_st_b(dst1, dst_argb, 16);
+ dst_argb += 32;
+ }
+}
+
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* dst,
+ int dst_width) {
+ int x;
+ (void)src_stride;
+ v16u8 src0, src1, src2, src3;
+ v16u8 vec0, vec1, vec2;
+ v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
+ v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
+ v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
+ 21, 23, 24, 25, 27, 28, 29, 31};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+ vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+ vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
+ vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
+ __msa_st_b((v16i8)vec0, dst, 0);
+ __msa_st_b((v16i8)vec1, dst, 16);
+ __msa_st_b((v16i8)vec2, dst, 32);
+ src_ptr += 64;
+ dst += 48;
+ }
+}
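+
+// For reference, the shuffle masks above implement plain 4-to-3 point
+// sampling: of every four source bytes, bytes 0, 1 and 3 are kept
+// (hypothetical helper, illustration only).
+static void ScaleRowDown34_Sketch(const uint8_t* src_ptr,
+                                  uint8_t* dst,
+                                  int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; x += 3) {
+    dst[x + 0] = src_ptr[0];
+    dst[x + 1] = src_ptr[1];
+    dst[x + 2] = src_ptr[3];
+    src_ptr += 4;
+  }
+}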
+
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+ v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+ v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+ v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+ v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+ v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+ 16, 17, 17, 18, 18, 19, 20, 21};
+ v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+ v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+ v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+ v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+ vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+ vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+ vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+ vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+ vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+ vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+ vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+ vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+ vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+ reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+ reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+ reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+ reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+ reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+ reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+ reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+ reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+ reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+ reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+ reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+ reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+ reg0 = __msa_srar_h(reg0, shft0);
+ reg1 = __msa_srar_h(reg1, shft1);
+ reg2 = __msa_srar_h(reg2, shft2);
+ reg3 = __msa_srar_h(reg3, shft0);
+ reg4 = __msa_srar_h(reg4, shft1);
+ reg5 = __msa_srar_h(reg5, shft2);
+ reg6 = __msa_srar_h(reg6, shft0);
+ reg7 = __msa_srar_h(reg7, shft1);
+ reg8 = __msa_srar_h(reg8, shft2);
+ reg9 = __msa_srar_h(reg9, shft0);
+ reg10 = __msa_srar_h(reg10, shft1);
+ reg11 = __msa_srar_h(reg11, shft2);
+ reg0 = reg0 * 3 + reg6;
+ reg1 = reg1 * 3 + reg7;
+ reg2 = reg2 * 3 + reg8;
+ reg3 = reg3 * 3 + reg9;
+ reg4 = reg4 * 3 + reg10;
+ reg5 = reg5 * 3 + reg11;
+ reg0 = __msa_srari_h(reg0, 2);
+ reg1 = __msa_srari_h(reg1, 2);
+ reg2 = __msa_srari_h(reg2, 2);
+ reg3 = __msa_srari_h(reg3, 2);
+ reg4 = __msa_srari_h(reg4, 2);
+ reg5 = __msa_srari_h(reg5, 2);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ __msa_st_b((v16i8)dst0, d, 0);
+ __msa_st_b((v16i8)dst1, d, 16);
+ __msa_st_b((v16i8)dst2, d, 32);
+ s += 64;
+ t += 64;
+ d += 48;
+ }
+}
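+
+// For reference, a plain-C sketch of the filtering above (hypothetical
+// helper, illustration only). Each group of four source columns yields
+// three outputs with 3:1 / 1:1 / 1:3 horizontal taps; the two rows are
+// then blended 3:1. The _1_ variant below blends the rows 1:1 instead.
+static void ScaleRowDown34_0_Box_Sketch(const uint8_t* s,
+                                        const uint8_t* t,
+                                        uint8_t* dst,
+                                        int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; x += 3) {
+    uint8_t a0 = (uint8_t)((s[0] * 3 + s[1] + 2) >> 2);
+    uint8_t a1 = (uint8_t)((s[1] + s[2] + 1) >> 1);
+    uint8_t a2 = (uint8_t)((s[2] + s[3] * 3 + 2) >> 2);
+    uint8_t b0 = (uint8_t)((t[0] * 3 + t[1] + 2) >> 2);
+    uint8_t b1 = (uint8_t)((t[1] + t[2] + 1) >> 1);
+    uint8_t b2 = (uint8_t)((t[2] + t[3] * 3 + 2) >> 2);
+    dst[x + 0] = (uint8_t)((a0 * 3 + b0 + 2) >> 2);
+    dst[x + 1] = (uint8_t)((a1 * 3 + b1 + 2) >> 2);
+    dst[x + 2] = (uint8_t)((a2 * 3 + b2 + 2) >> 2);
+    s += 4;
+    t += 4;
+  }
+}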
+
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint8_t* d,
+ int dst_width) {
+ const uint8_t* s = src_ptr;
+ const uint8_t* t = src_ptr + src_stride;
+ int x;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+ v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+ v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+ v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+ v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+ v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+ v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+ v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+ 16, 17, 17, 18, 18, 19, 20, 21};
+ v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+ v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+ v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+ v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
+
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+
+ for (x = 0; x < dst_width; x += 48) {
+ src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+ src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+ src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+ src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+ src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+ src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+ src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+ src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+ vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+ vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+ vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+ vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+ vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+ vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+ vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+ vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+ vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+ vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+ vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+ vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+ reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+ reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+ reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+ reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+ reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+ reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+ reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+ reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+ reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+ reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+ reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+ reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+ reg0 = __msa_srar_h(reg0, shft0);
+ reg1 = __msa_srar_h(reg1, shft1);
+ reg2 = __msa_srar_h(reg2, shft2);
+ reg3 = __msa_srar_h(reg3, shft0);
+ reg4 = __msa_srar_h(reg4, shft1);
+ reg5 = __msa_srar_h(reg5, shft2);
+ reg6 = __msa_srar_h(reg6, shft0);
+ reg7 = __msa_srar_h(reg7, shft1);
+ reg8 = __msa_srar_h(reg8, shft2);
+ reg9 = __msa_srar_h(reg9, shft0);
+ reg10 = __msa_srar_h(reg10, shft1);
+ reg11 = __msa_srar_h(reg11, shft2);
+ reg0 += reg6;
+ reg1 += reg7;
+ reg2 += reg8;
+ reg3 += reg9;
+ reg4 += reg10;
+ reg5 += reg11;
+ reg0 = __msa_srari_h(reg0, 1);
+ reg1 = __msa_srari_h(reg1, 1);
+ reg2 = __msa_srari_h(reg2, 1);
+ reg3 = __msa_srari_h(reg3, 1);
+ reg4 = __msa_srari_h(reg4, 1);
+ reg5 = __msa_srari_h(reg5, 1);
+ dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+ dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+ dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+ __msa_st_b((v16i8)dst0, d, 0);
+ __msa_st_b((v16i8)dst1, d, 16);
+ __msa_st_b((v16i8)dst2, d, 32);
+ s += 64;
+ t += 64;
+ d += 48;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/files/source/scale_neon.cc b/files/source/scale_neon.cc
index 9b4dce3..366b155 100644
--- a/files/source/scale_neon.cc
+++ b/files/source/scale_neon.cc
@@ -23,590 +23,529 @@
// Provided by Fritz Koenig
// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr,
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n"
- "subs %2, %2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
);
}
// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc
- "subs %2, %2, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #1 \n"
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
- MEMACCESS(1)
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
- "subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "q0", "q1", "q2", "q3" // Clobber List
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent +
+ // row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and
+ // pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
);
}
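+
+// For reference, plain-C sketches of the two 2x downscalers above
+// (hypothetical helper, illustration only): vrhadd is a rounding half add,
+// i.e. dst[i] = (s[2*i] + s[2*i+1] + 1) >> 1 for the linear path, and the
+// box path averages a full 2x2 block with rounding.
+static void ScaleRowDown2Box_Sketch(const uint8_t* s,
+                                    const uint8_t* t,
+                                    uint8_t* dst,
+                                    int dst_width) {
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    dst[i] = (uint8_t)((s[2 * i] + s[2 * i + 1] + t[2 * i] + t[2 * i + 1] +
+                        2) >> 2);
+  }
+}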
-void ScaleRowDown4_NEON(const uint8* src_ptr,
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "q0", "q1", "memory", "cc"
- );
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc");
}
-void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride;
- const uint8* src_ptr2 = src_ptr + src_stride * 2;
- const uint8* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
- MEMACCESS(3)
- "vld1.8 {q1}, [%3]! \n"
- MEMACCESS(4)
- "vld1.8 {q2}, [%4]! \n"
- MEMACCESS(5)
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
- MEMACCESS(1)
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
- :
- : "q0", "q1", "q2", "q3", "memory", "cc"
- );
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [%3]! \n"
+ "vld1.8 {q2}, [%4]! \n"
+ "vld1.8 {q3}, [%5]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc");
}
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "d0", "d1", "d2", "d3", "memory", "cc"
- );
+ asm volatile(
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc");
}
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
- // 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
- // (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
- );
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+ "cc");
}
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
- // average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ asm volatile(
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
- MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
- );
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
}
#define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
-static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
- 18, 6, 14, 19, 0, 0, 0, 0};
-static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
-static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
+ 18, 6, 14, 19, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- MEMACCESS(3)
- "vld1.8 {q3}, [%3] \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
- );
+ asm volatile(
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
}
// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
- asm volatile (
- MEMACCESS(5)
- "vld1.16 {q13}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {q14}, [%6] \n"
- MEMACCESS(7)
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- "1: \n"
+ asm volatile(
+ "vld1.16 {q13}, [%5] \n"
+ "vld1.8 {q14}, [%6] \n"
+ "vld1.8 {q15}, [%7] \n"
+ "add %3, %0 \n"
+ "1: \n"
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- MEMACCESS(4)
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
+ "subs %2, %2, #12 \n"
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
+      // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
- // combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
+      // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
- );
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+ "cc");
}
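+
+// For reference, the reciprocal trick used above, in plain C (hypothetical
+// helper, illustration only): vqrdmulh.s16 computes (2*a*b + 0x8000) >> 16,
+// so multiplying by 65536/12 both halves the doubled product and divides
+// the 6-tap sum by 6.
+static uint16_t DivBy6_Sketch(uint16_t sum) {
+  return (uint16_t)((2 * sum * (65536 / 12) + 0x8000) >> 16);  // ~ sum / 6
+}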
// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- MEMACCESS(4)
- "vld1.16 {q13}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- "1: \n"
+ asm volatile(
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
- MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
+      // Shuffle the input data around to align it
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
- // combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
- // combine source lines
- "vadd.u16 q1, q3 \n"
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
- // 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
+      // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "vmov.u8 d2, d4 \n"
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
+ "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
- MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
- MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "vld1.16 {q1, q2}, [%1] \n" // load accumulator
+ "vld1.8 {q0}, [%0]! \n" // load 16 bytes
+ "vaddw.u8 q2, q2, d1 \n" // add
+ "vaddw.u8 q1, q1, d0 \n"
+ "vst1.16 {q1, q2}, [%1]! \n" // store accumulator
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2" // Clobber List
);
}
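+
+// For reference, the scalar equivalent of the accumulation above
+// (hypothetical helper, illustration only):
+static void ScaleAddRow_Sketch(const uint8_t* src_ptr,
+                               uint16_t* dst_ptr,
+                               int src_width) {
+  int i;
+  for (i = 0; i < src_width; ++i) {
+    dst_ptr[i] = (uint16_t)(dst_ptr[i] + src_ptr[i]);  // widen into shorts
+  }
+}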
-void ScaleAddRows_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint16* dst_ptr,
- int src_width,
- int src_height) {
- const uint8* src_tmp;
- asm volatile (
- "1: \n"
- "mov %0, %1 \n"
- "mov r12, %5 \n"
- "veor q2, q2, q2 \n"
- "veor q3, q3, q3 \n"
- "2: \n"
- // load 16 pixels into q0
- MEMACCESS(0)
- "vld1.8 {q0}, [%0], %3 \n"
- "vaddw.u8 q3, q3, d1 \n"
- "vaddw.u8 q2, q2, d0 \n"
- "subs r12, r12, #1 \n"
- "bgt 2b \n"
- MEMACCESS(2)
- "vst1.16 {q2, q3}, [%2]! \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %4, %4, #16 \n" // 16 processed per loop
- "bgt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
- :
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
-// clang-format on
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n"
// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-void ScaleFilterCols_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
- const uint8* src_tmp = src_ptr;
+ const uint8_t* src_tmp = src_ptr;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
@@ -643,7 +582,6 @@
"vadd.s16 q8, q8, q9 \n"
"vmovn.s16 d6, q8 \n"
- MEMACCESS(0)
"vst1.8 {d6}, [%0]! \n" // store pixels
"vadd.s32 q1, q1, q0 \n"
"vadd.s32 q2, q2, q0 \n"
@@ -665,351 +603,299 @@
#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
- asm volatile (
- "cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
- "cmp %4, #64 \n"
- "beq 75f \n"
- "cmp %4, #128 \n"
- "beq 50f \n"
- "cmp %4, #192 \n"
- "beq 25f \n"
+ asm volatile(
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
- "b 99f \n"
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
- "b 99f \n"
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
- "b 99f \n"
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
- MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
- "subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
- "b 99f \n"
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
- "subs %3, %3, #16 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
- "99: \n"
- MEMACCESS(0)
- "vst1.8 {d1[7]}, [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
- :
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
- );
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
}
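
Per byte, the general-purpose branch above computes a rounded fixed-point
blend of the two rows; the 0, 64, 128 and 192 fractions merely take the
cheaper specialized branches. A scalar sketch of the general case
(hypothetical name):

#include <stdint.h>

// Blend two rows with an 8.8 fixed-point fraction (0..256); the
// "+ 128" then ">> 8" matches the vrshrn.u16 #8 rounding narrow.
static void ScaleFilterRowsScalar(uint8_t* dst, const uint8_t* row0,
                                  const uint8_t* row1, int width,
                                  int source_y_fraction) {
  int f1 = source_y_fraction;
  int f0 = 256 - f1;  // the "rsb %4, #256" above
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((row0[x] * f0 + row1[x] * f1 + 128) >> 8);
  }
}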
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.32 {q0, q1}, [%0]! \n"
- MEMACCESS(0)
- "vld2.32 {q2, q3}, [%0]! \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vmov q2, q1 \n" // load next 8 ARGB
+ "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
+// 4a: 3e04 subs r6, #4
+// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]!
+// 50: ef64 21f4 vorr q9, q10, q10
+// 54: f942 038d vst2.32 {d16-d19}, [r2]!
+// 58: d1f5 bne.n 46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #1 \n"
- "vrshrn.u16 d2, q2, #1 \n"
- "vrshrn.u16 d3, q3, #1 \n"
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ asm volatile(
+ "1: \n"
+ "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vrhadd.u8 q0, q0, q1 \n" // rounding half add
+ "vrhadd.u8 q1, q2, q3 \n" // rounding half add
+ "vst2.32 {q0, q1}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
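
The rewrite above replaces the pairwise-add/narrow sequence with vrhadd,
a rounding half add: each output channel is (a + b + 1) >> 1 of two
adjacent pixels. A scalar sketch (hypothetical names):

#include <stdint.h>

static uint8_t RoundingHalfAdd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // what vrhadd.u8 computes per lane
}

// Average horizontal pixel pairs of an ARGB row, channel by channel.
static void ARGBDown2LinearScalar(const uint8_t* src_argb,
                                  uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      dst_argb[x * 4 + c] =
          RoundingHalfAdd(src_argb[x * 8 + c], src_argb[x * 8 + 4 + c]);
    }
  }
}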
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
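
The box kernel above sums each 2x2 block per channel with vpaddl/vpadal
into 16 bits, then vrshrn.u16 #2 divides by 4 with rounding. A scalar
sketch taking the two rows explicitly (hypothetical name):

#include <stdint.h>

static void ARGBDown2BoxScalar(const uint8_t* row0, const uint8_t* row1,
                               uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    for (int c = 0; c < 4; ++c) {  // B, G, R, A
      int sum = row0[x * 8 + c] + row0[x * 8 + 4 + c] +  // top 2 pixels
                row1[x * 8 + c] + row1[x * 8 + 4 + c];   // bottom 2 pixels
      dst_argb[x * 4 + c] = (uint8_t)((sum + 2) >> 2);   // rounded / 4
    }
  }
}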
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- "mov r12, %3, lsl #2 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], r12 \n"
- MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0"
- );
+ asm volatile(
+ "mov r12, %3, lsl #2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0");
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
- asm volatile (
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
- MEMACCESS(1)
- "vld1.8 {d1}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d2}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d3}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d4}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d5}, [%1], r12 \n"
- MEMACCESS(0)
- "vld1.8 {d6}, [%0], r12 \n"
- MEMACCESS(1)
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
- );
+ asm volatile(
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
}
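
The stepped variant above does the same 2x2 box average but advances
src_stepx source pixels per output pixel (r12 holds src_stepx * 4 bytes).
A scalar sketch (hypothetical name):

#include <stdint.h>

static void ARGBDownEvenBoxScalar(const uint8_t* row0, const uint8_t* row1,
                                  int src_stepx, uint8_t* dst_argb,
                                  int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* p0 = row0 + x * src_stepx * 4;  // top-left of the block
    const uint8_t* p1 = row1 + x * src_stepx * 4;  // bottom-left
    for (int c = 0; c < 4; ++c) {
      int sum = p0[c] + p0[4 + c] + p1[c] + p1[4 + c];
      dst_argb[x * 4 + c] = (uint8_t)((sum + 2) >> 2);
    }
  }
}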
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD1_DATA32_LANE(dn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
+#define LOAD1_DATA32_LANE(dn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
"vld1.32 {" #dn "[" #n "]}, [%6] \n"
-// clang-format on
-void ScaleARGBCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
int tmp;
- const uint8* src_tmp = src_argb;
- asm volatile (
- "1: \n"
- LOAD1_DATA32_LANE(d0, 0)
- LOAD1_DATA32_LANE(d0, 1)
- LOAD1_DATA32_LANE(d1, 0)
- LOAD1_DATA32_LANE(d1, 1)
- LOAD1_DATA32_LANE(d2, 0)
- LOAD1_DATA32_LANE(d2, 1)
- LOAD1_DATA32_LANE(d3, 0)
- LOAD1_DATA32_LANE(d3, 1)
-
- MEMACCESS(0)
- "vst1.32 {q0, q1}, [%0]! \n" // store pixels
- "subs %2, %2, #8 \n" // 8 processed per loop
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width), // %2
- "+r"(x), // %3
- "+r"(dx), // %4
- "=&r"(tmp), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "q0", "q1"
- );
+ const uint8_t* src_tmp = src_argb;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(d0, 0)
+ LOAD1_DATA32_LANE(d0, 1)
+ LOAD1_DATA32_LANE(d1, 0)
+ LOAD1_DATA32_LANE(d1, 1)
+ LOAD1_DATA32_LANE(d2, 0)
+ LOAD1_DATA32_LANE(d2, 1)
+ LOAD1_DATA32_LANE(d3, 0)
+ LOAD1_DATA32_LANE(d3, 1)
+ // clang-format on
+ "vst1.32 {q0, q1}, [%0]! \n" // store pixels
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x), // %3
+ "+r"(dx), // %4
+ "=&r"(tmp), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "q0", "q1");
}
#undef LOAD1_DATA32_LANE
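
ScaleARGBCols_NEON above point-samples whole 32-bit pixels at 16.16
fixed-point positions: LOAD1_DATA32_LANE's "lsr %5, %3, #16" is the
x >> 16 below and its "add %3, %3, %4" is x += dx. A scalar sketch with
a hypothetical name:

#include <stdint.h>
#include <string.h>

static void ScaleARGBColsScalar(uint8_t* dst_argb, const uint8_t* src_argb,
                                int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    memcpy(dst_argb + j * 4, src_argb + (x >> 16) * 4, 4);  // copy 1 pixel
    x += dx;
  }
}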
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA32_LANE(dn1, dn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
-// clang-format on
+#define LOAD2_DATA32_LANE(dn1, dn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
-void ScaleARGBFilterCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
- const uint8* src_tmp = src_argb;
+ const uint8_t* src_tmp = src_argb;
asm volatile (
"vdup.32 q0, %3 \n" // x
"vdup.32 q1, %4 \n" // dx
@@ -1045,7 +931,6 @@
"vshrn.i16 d0, q11, #7 \n"
"vshrn.i16 d1, q12, #7 \n"
- MEMACCESS(0)
"vst1.32 {d0, d1}, [%0]! \n" // store pixels
"vadd.s32 q8, q8, q9 \n"
"subs %2, %2, #4 \n" // 4 processed per loop
diff --git a/files/source/scale_neon64.cc b/files/source/scale_neon64.cc
index 1ff5f2b..0a7b80c 100644
--- a/files/source/scale_neon64.cc
+++ b/files/source/scale_neon64.cc
@@ -21,610 +21,544 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr,
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- // load even pixels into v0, odd into v1
- MEMACCESS(0)
- "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
);
}
// Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
- "rshrn2 v0.16b, v1.8h, #1 \n"
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1" // Clobber List
+ asm volatile(
+ "1: \n"
+ // load even pixels into v0, odd into v1
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
- MEMACCESS(1)
- "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
- "subs %w3, %w3, #16 \n" // 16 processed per loop
- "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
- "uaddlp v1.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
- "uadalp v1.8h, v3.16b \n"
- "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
- "rshrn2 v0.16b, v1.8h, #2 \n"
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #16 \n" // 16 processed per loop
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
);
}
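
The planar 32x2 -> 16x1 box above chains uaddlp (widen and add adjacent
bytes), uadalp (accumulate the second row the same way) and rshrn #2
(rounded divide by 4). The same computation in scalar form, taking the
two rows explicitly (hypothetical name):

#include <stdint.h>

static void RowDown2BoxScalar(const uint8_t* row0, const uint8_t* row1,
                              uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = row0[2 * x] + row0[2 * x + 1];  // uaddlp: row 1 adjacent add
    sum += row1[2 * x] + row1[2 * x + 1];     // uadalp: accumulate row 2
    dst[x] = (uint8_t)((sum + 2) >> 2);       // rshrn #2: round and narrow
  }
}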
-void ScaleRowDown4_NEON(const uint8* src_ptr,
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "st1 {v2.8b}, [%1], #8 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
}
-void ScaleRowDown4Box_NEON(const uint8* src_ptr,
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride;
- const uint8* src_ptr2 = src_ptr + src_stride * 2;
- const uint8* src_ptr3 = src_ptr + src_stride * 3;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
- MEMACCESS(3)
- "ld1 {v1.16b}, [%2], #16 \n"
- MEMACCESS(4)
- "ld1 {v2.16b}, [%3], #16 \n"
- MEMACCESS(5)
- "ld1 {v3.16b}, [%4], #16 \n"
- "subs %w5, %w5, #4 \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v0.8h, v1.16b \n"
- "uadalp v0.8h, v2.16b \n"
- "uadalp v0.8h, v3.16b \n"
- "addp v0.8h, v0.8h, v0.8h \n"
- "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
- MEMACCESS(1)
- "st1 {v0.s}[0], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(src_ptr1), // %2
- "+r"(src_ptr2), // %3
- "+r"(src_ptr3), // %4
- "+r"(dst_width) // %5
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
+ const uint8_t* src_ptr1 = src_ptr + src_stride;
+ const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %w5, %w5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
}
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into one of 4 different registers.
// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- "subs %w2, %w2, #24 \n"
- "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
- MEMACCESS(1)
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
+ asm volatile(
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %w2, %w2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "v0", "v1", "v2", "v3", "memory", "cc");
}
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
- // filter src line 0 with src line 1
- // expand chars to shorts to allow for room
- // when adding lines together
- "ushll v16.8h, v4.8b, #0 \n"
- "ushll v17.8h, v5.8b, #0 \n"
- "ushll v18.8h, v6.8b, #0 \n"
- "ushll v19.8h, v7.8b, #0 \n"
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
- // 3 * line_0 + line_1
- "umlal v16.8h, v0.8b, v20.8b \n"
- "umlal v17.8h, v1.8b, v20.8b \n"
- "umlal v18.8h, v2.8b, v20.8b \n"
- "umlal v19.8h, v3.8b, v20.8b \n"
+ // 3 * line_0 + line_1
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
- // (3 * line_0 + line_1) >> 2
- "uqrshrn v0.8b, v16.8h, #2 \n"
- "uqrshrn v1.8b, v17.8h, #2 \n"
- "uqrshrn v2.8b, v18.8h, #2 \n"
- "uqrshrn v3.8b, v19.8h, #2 \n"
+ // (3 * line_0 + line_1) >> 2
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v16.8h, v1.8b, #0 \n"
- "umlal v16.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v16.8h, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v16.8h, v2.8b, #0 \n"
- "umlal v16.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v16.8h, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
- MEMACCESS(1)
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
- "v20", "memory", "cc"
- );
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "memory", "cc");
}
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- asm volatile (
- "movi v20.8b, #3 \n"
- "add %3, %3, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
- MEMACCESS(3)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
- "subs %w2, %w2, #24 \n"
- // average src line 0 with src line 1
- "urhadd v0.8b, v0.8b, v4.8b \n"
- "urhadd v1.8b, v1.8b, v5.8b \n"
- "urhadd v2.8b, v2.8b, v6.8b \n"
- "urhadd v3.8b, v3.8b, v7.8b \n"
+ asm volatile(
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %w2, %w2, #24 \n"
+ // average src line 0 with src line 1
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
- // a0 = (src[0] * 3 + s[1] * 1) >> 2
- "ushll v4.8h, v1.8b, #0 \n"
- "umlal v4.8h, v0.8b, v20.8b \n"
- "uqrshrn v0.8b, v4.8h, #2 \n"
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
- // a1 = (src[1] * 1 + s[2] * 1) >> 1
- "urhadd v1.8b, v1.8b, v2.8b \n"
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "urhadd v1.8b, v1.8b, v2.8b \n"
- // a2 = (src[2] * 1 + s[3] * 3) >> 2
- "ushll v4.8h, v2.8b, #0 \n"
- "umlal v4.8h, v3.8b, v20.8b \n"
- "uqrshrn v2.8b, v4.8h, #2 \n"
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
- MEMACCESS(1)
- "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
- );
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}
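
After the vertical filtering (3:1 for the _0_Box variant, 1:1 for the
_1_Box variant above), both kernels apply the three horizontal taps named
in the comments. A scalar sketch of one 4 -> 3 group on an already
row-filtered line, with rounding matching uqrshrn #2 and urhadd
(hypothetical name; assumes dst_width is a multiple of 3):

#include <stdint.h>

static void RowDown34Scalar(const uint8_t* s, uint8_t* d, int dst_width) {
  for (int x = 0; x < dst_width; x += 3, s += 4) {
    d[x + 0] = (uint8_t)((s[0] * 3 + s[1] + 2) >> 2);  // a0
    d[x + 1] = (uint8_t)((s[1] + s[2] + 1) >> 1);      // a1
    d[x + 2] = (uint8_t)((s[2] + s[3] * 3 + 2) >> 2);  // a2
  }
}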
-static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
-static uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
- 34, 6, 22, 35, 0, 0, 0, 0};
-static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
- 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
-static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
- 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};
+static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
+ 22, 24, 27, 30, 0, 0, 0, 0};
+static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
+ 34, 6, 22, 35, 0, 0, 0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18};
// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
- asm volatile (
- MEMACCESS(3)
- "ld1 {v3.16b}, [%3] \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
- "subs %w2, %w2, #12 \n"
- "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
- MEMACCESS(1)
- "st1 {v2.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v2.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(&kShuf38) // %3
- : "v0", "v1", "v2", "v3", "memory", "cc"
- );
+ asm volatile(
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %w2, %w2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
+ "st1 {v2.8b}, [%1], #8 \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "v0", "v1", "v2", "v3", "memory", "cc");
}
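
The tbl above is a byte shuffle: the first 12 entries of kShuf38 pick
which of the 32 loaded bytes survive, and the trailing zeros only pad
the shuffle vector. The same 32 -> 12 point sampling in scalar form
(hypothetical name; assumes dst_width is a multiple of 12):

#include <stdint.h>

static void RowDown38Scalar(const uint8_t* src, uint8_t* dst,
                            int dst_width) {
  static const uint8_t kIdx[12] = {0,  3,  6,  8,  11, 14,
                                   16, 19, 22, 24, 27, 30};  // from kShuf38
  for (int x = 0; x < dst_width; x += 12, src += 32) {
    for (int i = 0; i < 12; ++i) {
      dst[x + i] = src[kIdx[i]];
    }
  }
}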
// 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
- const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
- asm volatile (
- MEMACCESS(5)
- "ld1 {v29.8h}, [%5] \n"
- MEMACCESS(6)
- "ld1 {v30.16b}, [%6] \n"
- MEMACCESS(7)
- "ld1 {v31.8h}, [%7] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ asm volatile(
+ "ld1 {v29.8h}, [%5] \n"
+ "ld1 {v30.16b}, [%6] \n"
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- MEMACCESS(4)
- "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
- "subs %w4, %w4, #12 \n"
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %w4, %w4, #12 \n"
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v20.8b, v0.8b, v1.8b \n"
- "trn2 v21.8b, v0.8b, v1.8b \n"
- "trn1 v22.8b, v4.8b, v5.8b \n"
- "trn2 v23.8b, v4.8b, v5.8b \n"
- "trn1 v24.8b, v16.8b, v17.8b \n"
- "trn2 v25.8b, v16.8b, v17.8b \n"
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
- "trn1 v16.8b, v18.8b, v19.8b \n"
- "trn2 v17.8b, v18.8b, v19.8b \n"
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v20.4h, v20.8b \n"
- "uaddlp v21.4h, v21.8b \n"
- "uaddlp v22.4h, v22.8b \n"
- "uaddlp v23.4h, v23.8b \n"
- "uaddlp v24.4h, v24.8b \n"
- "uaddlp v25.4h, v25.8b \n"
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
- "uaddlp v17.4h, v17.8b \n"
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
- // combine source lines
- "add v20.4h, v20.4h, v22.4h \n"
- "add v21.4h, v21.4h, v23.4h \n"
- "add v20.4h, v20.4h, v24.4h \n"
- "add v21.4h, v21.4h, v25.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
- "add v2.4h, v2.4h, v17.4h \n"
+ // combine source lines
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
- // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
- // + s[6 + st * 1] + s[7 + st * 1]
- // + s[6 + st * 2] + s[7 + st * 2]) / 6
- "sqrdmulh v2.8h, v2.8h, v29.8h \n"
- "xtn v2.8b, v2.8h \n"
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "ushll v16.8h, v16.8b, #0 \n"
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
- // combine source lines
- "add v0.8h, v0.8h, v16.8h \n"
+ // combine source lines
+ "add v0.8h, v0.8h, v16.8h \n"
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
- // 0+1+2, 3+4+5
- "add v20.8h, v20.8h, v0.8h \n"
- "add v21.8h, v21.8h, v4.8h \n"
+ // 0+1+2, 3+4+5
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v20.8h, v31.8h \n"
- "sqrdmulh v1.8h, v21.8h, v31.8h \n"
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
- // Align for table lookup, vtbl requires registers to
- // be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+ // Align for table lookup, vtbl requires registers to be adjacent
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
- MEMACCESS(1)
- "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(src_ptr1), // %3
- "+r"(dst_width) // %4
- : "r"(&kMult38_Div6), // %5
- "r"(&kShuf38_2), // %6
- "r"(&kMult38_Div9) // %7
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
- "v30", "v31", "memory", "cc"
- );
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+ "memory", "cc");
}
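
The "multiply by 65536 / n" comment above leans on what sqrdmulh
actually computes, roughly (2 * a * b + 0x8000) >> 16 per lane: because
of that doubling, dividing the 6-sample sums by 6 uses 65536 / 12 from
kMult38_Div6, and the 9-sample sums use 65536 / 18 from kMult38_Div9.
A one-line scalar model (ignoring the saturation sqrdmulh also applies,
which these operand ranges never reach):

#include <stdint.h>

// Models "sqrdmulh v, sum, kMult38_Div6"; the result is within rounding
// of sum / 6 for sums of six 8-bit samples.
static uint8_t DivSumBy6(uint16_t sum) {
  return (uint8_t)((2 * (int32_t)sum * (65536 / 12) + 0x8000) >> 16);
}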
// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
// TODO(fbarchard): use src_stride directly for clang 3.5+.
ptrdiff_t tmp_src_stride = src_stride;
- asm volatile (
- MEMACCESS(4)
- "ld1 {v30.8h}, [%4] \n"
- MEMACCESS(5)
- "ld1 {v31.16b}, [%5] \n"
- "add %2, %2, %0 \n"
- "1: \n"
+ asm volatile(
+ "ld1 {v30.8h}, [%4] \n"
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
- // 00 40 01 41 02 42 03 43
- // 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63
- // 30 70 31 71 32 72 33 73
- MEMACCESS(0)
- "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
- MEMACCESS(3)
- "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
- "subs %w3, %w3, #12 \n"
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %w3, %w3, #12 \n"
- // Shuffle the input data around to get align the data
- // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // 00 10 01 11 02 12 03 13
- // 40 50 41 51 42 52 43 53
- "trn1 v16.8b, v0.8b, v1.8b \n"
- "trn2 v17.8b, v0.8b, v1.8b \n"
- "trn1 v18.8b, v4.8b, v5.8b \n"
- "trn2 v19.8b, v4.8b, v5.8b \n"
+ // Shuffle the input data around to align the data
+ // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
- // 20 30 21 31 22 32 23 33
- // 60 70 61 71 62 72 63 73
- "trn1 v0.8b, v2.8b, v3.8b \n"
- "trn2 v1.8b, v2.8b, v3.8b \n"
- "trn1 v4.8b, v6.8b, v7.8b \n"
- "trn2 v5.8b, v6.8b, v7.8b \n"
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
- // 00+10 01+11 02+12 03+13
- // 40+50 41+51 42+52 43+53
- "uaddlp v16.4h, v16.8b \n"
- "uaddlp v17.4h, v17.8b \n"
- "uaddlp v18.4h, v18.8b \n"
- "uaddlp v19.4h, v19.8b \n"
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
- // 60+70 61+71 62+72 63+73
- "uaddlp v1.4h, v1.8b \n"
- "uaddlp v5.4h, v5.8b \n"
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
- // combine source lines
- "add v16.4h, v16.4h, v18.4h \n"
- "add v17.4h, v17.4h, v19.4h \n"
- "add v2.4h, v1.4h, v5.4h \n"
+ // combine source lines
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
- // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "uqrshrn v2.8b, v2.8h, #2 \n"
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "uqrshrn v2.8b, v2.8h, #2 \n"
- // Shuffle 2,3 reg around so that 2 can be added to the
- // 0,1 reg and 3 can be added to the 4,5 reg. This
- // requires expanding from u8 to u16 as the 0,1 and 4,5
- // registers are already expanded. Then do transposes
- // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- // combine source lines
- "uaddl v0.8h, v0.8b, v4.8b \n"
+ // combine source lines
+ "uaddl v0.8h, v0.8b, v4.8b \n"
- // xx 20 xx 21 xx 22 xx 23
- // xx 30 xx 31 xx 32 xx 33
- "trn1 v1.8h, v0.8h, v0.8h \n"
- "trn2 v4.8h, v0.8h, v0.8h \n"
- "xtn v0.4h, v1.4s \n"
- "xtn v4.4h, v4.4s \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
- // 0+1+2, 3+4+5
- "add v16.8h, v16.8h, v0.8h \n"
- "add v17.8h, v17.8h, v4.8h \n"
+ // 0+1+2, 3+4+5
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
- // Need to divide, but can't downshift as the the value
- // isn't a power of 2. So multiply by 65536 / n
- // and take the upper 16 bits.
- "sqrdmulh v0.8h, v16.8h, v30.8h \n"
- "sqrdmulh v1.8h, v17.8h, v30.8h \n"
+ // Need to divide, but can't downshift as the value
+ // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
- // Align for table lookup, vtbl requires registers to
- // be adjacent
+ // Align for table lookup, vtbl requires registers to
+ // be adjacent
- "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
- MEMACCESS(1)
- "st1 {v3.8b}, [%1], #8 \n"
- MEMACCESS(1)
- "st1 {v3.s}[2], [%1], #4 \n"
- "b.gt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_src_stride), // %2
- "+r"(dst_width) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
- "v18", "v19", "v30", "v31", "memory", "cc"
+ "st1 {v3.8b}, [%1], #8 \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19", "v30", "v31", "memory", "cc");
+}
+
+// Add a row of bytes to a row of shorts. Used for box filter.
+// Reads 16 bytes and accumulates to 16 shorts at a time.
+void ScaleAddRow_NEON(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
+ int src_width) {
+ asm volatile(
+ "1: \n"
+ "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
+ "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
+ "uaddw2 v2.8h, v2.8h, v0.16b \n" // add
+ "uaddw v1.8h, v1.8h, v0.8b \n"
+ "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
+ "subs %w2, %w2, #16 \n" // 16 processed per loop
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(src_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2" // Clobber List
);
}
-void ScaleAddRows_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint16* dst_ptr,
- int src_width,
- int src_height) {
- const uint8* src_tmp;
- asm volatile (
- "1: \n"
- "mov %0, %1 \n"
- "mov w12, %w5 \n"
- "eor v2.16b, v2.16b, v2.16b \n"
- "eor v3.16b, v3.16b, v3.16b \n"
- "2: \n"
- // load 16 pixels into q0
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], %3 \n"
- "uaddw2 v3.8h, v3.8h, v0.16b \n"
- "uaddw v2.8h, v2.8h, v0.8b \n"
- "subs w12, w12, #1 \n"
- "b.gt 2b \n"
- MEMACCESS(2)
- "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
- "add %1, %1, #16 \n"
- "subs %w4, %w4, #16 \n" // 16 processed per loop
- "b.gt 1b \n"
- : "=&r"(src_tmp), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_ptr), // %2
- "+r"(src_stride), // %3
- "+r"(src_width), // %4
- "+r"(src_height) // %5
- :
- : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA8_LANE(n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
+#define LOAD2_DATA8_LANE(n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5 \n" \
+ "add %3, %3, %4 \n" \
"ld2 {v4.b, v5.b}[" #n "], [%6] \n"
-// clang-format on
// The NEON version mimics this formula (from row_common.cc):
-// #define BLENDER(a, b, f) (uint8)((int)(a) +
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
-void ScaleFilterCols_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
- const uint8* src_tmp = src_ptr;
- int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64)x;
- int64 dx64 = (int64)dx;
+ const uint8_t* src_tmp = src_ptr;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@@ -661,7 +595,6 @@
"add v4.8h, v4.8h, v6.8h \n"
"xtn v4.8b, v4.8h \n"
- MEMACCESS(0)
"st1 {v4.8b}, [%0], #8 \n" // store pixels
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
@@ -669,7 +602,7 @@
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
- "+r"(dst_width64), // %2
+ "+r"(dst_width), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
@@ -683,357 +616,300 @@
#undef LOAD2_DATA8_LANE
// 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
- const uint8* src_ptr,
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
int y_fraction = 256 - source_y_fraction;
- asm volatile (
- "cmp %w4, #0 \n"
- "b.eq 100f \n"
- "add %2, %2, %1 \n"
- "cmp %w4, #64 \n"
- "b.eq 75f \n"
- "cmp %w4, #128 \n"
- "b.eq 50f \n"
- "cmp %w4, #192 \n"
- "b.eq 25f \n"
+ asm volatile(
+ "cmp %w4, #0 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
+ "cmp %w4, #128 \n"
+ "b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
- "dup v5.8b, %w4 \n"
- "dup v4.8b, %w5 \n"
- // General purpose row blend.
- "1: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "umull v6.8h, v0.8b, v4.8b \n"
- "umull2 v7.8h, v0.16b, v4.16b \n"
- "umlal v6.8h, v1.8b, v5.8b \n"
- "umlal2 v7.8h, v1.16b, v5.16b \n"
- "rshrn v0.8b, v6.8h, #8 \n"
- "rshrn2 v0.16b, v7.8h, #8 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 1b \n"
- "b 99f \n"
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
+ // General purpose row blend.
+ "1: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
+ "b 99f \n"
- // Blend 25 / 75.
- "25: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 25b \n"
- "b 99f \n"
+ // Blend 25 / 75.
+ "25: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
- // Blend 50 / 50.
- "50: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v1.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 50b \n"
- "b 99f \n"
+ // Blend 50 / 50.
+ "50: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
+ "b 99f \n"
- // Blend 75 / 25.
- "75: \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- MEMACCESS(2)
- "ld1 {v0.16b}, [%2], #16 \n"
- "subs %w3, %w3, #16 \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- "urhadd v0.16b, v0.16b, v1.16b \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 75b \n"
- "b 99f \n"
+ // Blend 75 / 25.
+ "75: \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
- // Blend 100 / 0 - Copy row unchanged.
- "100: \n"
- MEMACCESS(1)
- "ld1 {v0.16b}, [%1], #16 \n"
- "subs %w3, %w3, #16 \n"
- MEMACCESS(0)
- "st1 {v0.16b}, [%0], #16 \n"
- "b.gt 100b \n"
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
- "99: \n"
- MEMACCESS(0)
- "st1 {v0.b}[15], [%0] \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(src_stride), // %2
- "+r"(dst_width), // %3
- "+r"(source_y_fraction),// %4
- "+r"(y_fraction) // %5
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
- );
+ "99: \n"
+ "st1 {v0.b}[15], [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction), // %4
+ "+r"(y_fraction) // %5
+ :
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- // load even pixels into q0, odd into q1
- MEMACCESS (0)
- "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
- MEMACCESS (0)
- "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- MEMACCESS (1)
- "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
- MEMACCESS (1)
- "st1 {v3.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r" (src_ptr), // %0
- "+r" (dst), // %1
- "+r" (dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "mov v2.16b, v3.16b \n"
+ "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS (0)
- // load 8 ARGB pixels.
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
- "rshrn v1.8b, v1.8h, #1 \n"
- "rshrn v2.8b, v2.8h, #1 \n"
- "rshrn v3.8b, v3.8h, #1 \n"
- MEMACCESS (1)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+ asm volatile(
+ "1: \n"
+ // load 16 ARGB pixels with even pixels into v0/v2, odd into v1/v3
+ "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+
+ "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
+ "urhadd v1.16b, v2.16b, v3.16b \n"
+ "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst,
+ uint8_t* dst,
int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS (0)
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
- "subs %w3, %w3, #8 \n" // 8 processed per loop.
- "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
- "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
- "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
- MEMACCESS (1)
- "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
- "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
- "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
- "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
- "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
- "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
- "rshrn v1.8b, v1.8h, #2 \n"
- "rshrn v2.8b, v2.8h, #2 \n"
- "rshrn v3.8b, v3.8h, #2 \n"
- MEMACCESS (2)
- "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
- "b.gt 1b \n"
- : "+r" (src_ptr), // %0
- "+r" (src_stride), // %1
- "+r" (dst), // %2
- "+r" (dst_width) // %3
- :
- : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
- );
+ asm volatile(
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB
+ "subs %w3, %w3, #8 \n" // 8 processed per loop.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.s}[0], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[1], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[2], [%0], %3 \n"
- MEMACCESS(0)
- "ld1 {v0.s}[3], [%0], %3 \n"
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
- MEMACCESS(1)
- "st1 {v0.16b}, [%1], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"((int64)(src_stepx * 4)) // %3
- : "memory", "cc", "v0"
- );
+ asm volatile(
+ "1: \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
+ "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"((int64_t)(src_stepx * 4)) // %3
+ : "memory", "cc", "v0");
}
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in the future.
// It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
- asm volatile (
- "add %1, %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
- MEMACCESS(1)
- "ld1 {v1.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v2.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v3.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v4.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v5.8b}, [%1], %4 \n"
- MEMACCESS(0)
- "ld1 {v6.8b}, [%0], %4 \n"
- MEMACCESS(1)
- "ld1 {v7.8b}, [%1], %4 \n"
- "uaddl v0.8h, v0.8b, v1.8b \n"
- "uaddl v2.8h, v2.8b, v3.8b \n"
- "uaddl v4.8h, v4.8b, v5.8b \n"
- "uaddl v6.8h, v6.8b, v7.8b \n"
- "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
- "mov v0.d[1], v2.d[0] \n"
- "mov v2.d[0], v16.d[1] \n"
- "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
- "mov v4.d[1], v6.d[0] \n"
- "mov v6.d[0], v16.d[1] \n"
- "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
- "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
- "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
- "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
- "subs %w3, %w3, #4 \n" // 4 pixels per loop.
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"((int64)(src_stepx * 4)) // %4
- : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
- );
+ asm volatile(
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
+ "ld1 {v1.8b}, [%1], %4 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+ "subs %w3, %w3, #4 \n" // 4 pixels per loop.
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"((int64_t)(src_stepx * 4)) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
- "ld1 {" #vn ".s}[" #n "], [%6] \n"
-// clang-format on
+#define LOAD1_DATA32_LANE(vn, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
+ "ld1 {" #vn ".s}[" #n "], [%6] \n"
-void ScaleARGBCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
- const uint8* src_tmp = src_argb;
- int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64)x;
- int64 dx64 = (int64)dx;
- int64 tmp64;
- asm volatile (
- "1: \n"
- LOAD1_DATA32_LANE(v0, 0)
- LOAD1_DATA32_LANE(v0, 1)
- LOAD1_DATA32_LANE(v0, 2)
- LOAD1_DATA32_LANE(v0, 3)
- LOAD1_DATA32_LANE(v1, 0)
- LOAD1_DATA32_LANE(v1, 1)
- LOAD1_DATA32_LANE(v1, 2)
- LOAD1_DATA32_LANE(v1, 3)
-
- MEMACCESS(0)
- "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
- "subs %w2, %w2, #8 \n" // 8 processed per loop
- "b.gt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+r"(dst_width64), // %2
- "+r"(x64), // %3
- "+r"(dx64), // %4
- "=&r"(tmp64), // %5
- "+r"(src_tmp) // %6
- :
- : "memory", "cc", "v0", "v1"
- );
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
+ int64_t tmp64;
+ asm volatile(
+ "1: \n"
+ // clang-format off
+ LOAD1_DATA32_LANE(v0, 0)
+ LOAD1_DATA32_LANE(v0, 1)
+ LOAD1_DATA32_LANE(v0, 2)
+ LOAD1_DATA32_LANE(v0, 3)
+ LOAD1_DATA32_LANE(v1, 0)
+ LOAD1_DATA32_LANE(v1, 1)
+ LOAD1_DATA32_LANE(v1, 2)
+ LOAD1_DATA32_LANE(v1, 3)
+ // clang-format on
+ "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+ "subs %w2, %w2, #8 \n" // 8 processed per loop
+ "b.gt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width), // %2
+ "+r"(x64), // %3
+ "+r"(dx64), // %4
+ "=&r"(tmp64), // %5
+ "+r"(src_tmp) // %6
+ :
+ : "memory", "cc", "v0", "v1");
}
#undef LOAD1_DATA32_LANE
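(Editorial sketch: LOAD1_DATA32_LANE steps across the source in 16.16 fixed point -- "lsr %5, %3, #16" takes the integer part of x as the pixel index and "add %3, %3, %4" advances x by dx per output pixel. A scalar model of the same stepping, with a hypothetical helper name:

def scale_cols_pointwise(src, dst_width, x, dx):
    # x and dx are 16.16 fixed point, as in ScaleARGBCols_NEON.
    out = []
    for _ in range(dst_width):
        out.append(src[x >> 16])
        x += dx
    return out

print(scale_cols_pointwise(list(range(8)), 4, 0, 2 << 16))  # [0, 2, 4, 6])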
-// clang-format off
// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n) \
- "lsr %5, %3, #16 \n" \
- "add %6, %1, %5, lsl #2 \n" \
- "add %3, %3, %4 \n" \
- MEMACCESS(6) \
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+ "lsr %5, %3, #16 \n" \
+ "add %6, %1, %5, lsl #2 \n" \
+ "add %3, %3, %4 \n" \
"ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"
-// clang-format on
-void ScaleARGBFilterCols_NEON(uint8* dst_argb,
- const uint8* src_argb,
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
int dx_offset[4] = {0, 1, 2, 3};
int* tmp = dx_offset;
- const uint8* src_tmp = src_argb;
- int64 dst_width64 = (int64)dst_width; // Work around ios 64 bit warning.
- int64 x64 = (int64)x;
- int64 dx64 = (int64)dx;
+ const uint8_t* src_tmp = src_argb;
+ int64_t x64 = (int64_t)x; // NOLINT
+ int64_t dx64 = (int64_t)dx; // NOLINT
asm volatile (
"dup v0.4s, %w3 \n" // x
"dup v1.4s, %w4 \n" // dx
@@ -1070,14 +946,13 @@
"shrn v0.8b, v16.8h, #7 \n"
"shrn2 v0.16b, v17.8h, #7 \n"
- MEMACCESS(0)
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
- "+r"(dst_width64), // %2
+ "+r"(dst_width), // %2
"+r"(x64), // %3
"+r"(dx64), // %4
"+r"(tmp), // %5
@@ -1090,6 +965,85 @@
#undef LOAD2_DATA32_LANE
+// Read 16x2 average down and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ // change the stride to a row 2 pointer
+ "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2
+ "1: \n"
+ "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc
+ "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc
+ "subs %w3, %w3, #8 \n" // 8 processed per loop
+ "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent
+ "uaddlp v1.4s, v1.8h \n"
+ "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent
+ "uadalp v1.4s, v3.8h \n"
+ "rshrn v0.4h, v0.4s, #2 \n" // round and pack
+ "rshrn2 v0.8h, v1.4s, #2 \n"
+ "st1 {v0.8h}, [%2], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "v0", "v1", "v2", "v3" // Clobber List
+ );
+}
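(Editorial sketch: here "uaddlp" pairwise-adds row 1 into 32-bit lanes, "uadalp" accumulates row 2 on top, and "rshrn #2" rounds the 2x2 sum back down. The equivalent per-pixel arithmetic:

def down2box_16(row1, row2):
    # Rounded 2x2 box average of two uint16 rows.
    return [(row1[2 * i] + row1[2 * i + 1] +
             row2[2 * i] + row2[2 * i + 1] + 2) >> 2
            for i in range(len(row1) // 2)]

print(down2box_16([0, 2, 65535, 65535], [2, 4, 65535, 65535]))  # [2, 65535])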
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width) {
+ asm volatile(
+ "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
+ "movi v0.8h, #9 \n" // constants
+ "movi v1.4s, #3 \n"
+
+ "1: \n"
+ "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8
+ "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1
+ "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row
+ "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1
+ "subs %w3, %w3, #16 \n" // 16 dst pixels per loop
+ "umull v16.4s, v3.4h, v0.4h \n"
+ "umull2 v7.4s, v3.8h, v0.8h \n"
+ "umull v18.4s, v4.4h, v0.4h \n"
+ "umull2 v17.4s, v4.8h, v0.8h \n"
+ "uaddw v16.4s, v16.4s, v6.4h \n"
+ "uaddl2 v19.4s, v6.8h, v3.8h \n"
+ "uaddl v3.4s, v6.4h, v3.4h \n"
+ "uaddw2 v6.4s, v7.4s, v6.8h \n"
+ "uaddl2 v7.4s, v5.8h, v4.8h \n"
+ "uaddl v4.4s, v5.4h, v4.4h \n"
+ "uaddw v18.4s, v18.4s, v5.4h \n"
+ "mla v16.4s, v4.4s, v1.4s \n"
+ "mla v18.4s, v3.4s, v1.4s \n"
+ "mla v6.4s, v7.4s, v1.4s \n"
+ "uaddw2 v4.4s, v17.4s, v5.8h \n"
+ "uqrshrn v16.4h, v16.4s, #4 \n"
+ "mla v4.4s, v19.4s, v1.4s \n"
+ "uqrshrn2 v16.8h, v6.4s, #4 \n"
+ "uqrshrn v17.4h, v18.4s, #4 \n"
+ "uqrshrn2 v17.8h, v4.4s, #4 \n"
+ "st2 {v16.8h-v17.8h}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ : "r"(2LL), // %4
+ "r"(14LL) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+ "v19" // Clobber List
+ );
+}
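(Editorial sketch: the constants 9 and 3 are the bilinear weights for a half-pixel-centered 2x upsample -- each output sample is (9*near + 3*across + 3*down + 1*diagonal) / 16, and "uqrshrn #4" performs the rounded, saturating divide by 16. A scalar model for one source pixel pair, with a hypothetical helper name:

def up2_16_pair(tl, tr, bl, br):
    # Two adjacent output pixels, 9:3:3:1 kernel with (sum + 8) >> 4.
    p0 = min((9 * tl + 3 * tr + 3 * bl + br + 8) >> 4, 65535)
    p1 = min((9 * tr + 3 * tl + 3 * br + bl + 8) >> 4, 65535)
    return p0, p1

print(up2_16_pair(100, 200, 100, 200))  # (125, 175))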
+
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
diff --git a/files/source/scale_win.cc b/files/source/scale_win.cc
index 0c5b3a1..c5fc86f 100644
--- a/files/source/scale_win.cc
+++ b/files/source/scale_win.cc
@@ -17,80 +17,81 @@
#endif
// This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
// Offsets for source bytes 0 to 9
-static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
-static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};
+static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
+ 8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
- 10, 11, 12, 13, 13, 14, 14, 15};
+static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
+ 10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Coefficients for source bytes 21 to 31
-static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
-static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
-static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
- 6, 8, 11, 14, 128, 128, 128, 128};
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
+ 6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
- 128, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
- 6, 7, 12, 13, 128, 128, 128, 128};
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
+ 6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
- 65536 / 9, 65536 / 6, 0, 0};
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+ 65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
- 11, 128, 14, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
+ 11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
- 12, 128, 15, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
+ 12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
- 13, 128, 128, 128, 128, 128, 128, 128};
+static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
+ 13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
- 65536 / 3, 65536 / 2, 0, 0};
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+ 65536 / 3, 65536 / 2, 0, 0};
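(Editorial sketch: the kScaleAc33/kScaleAb2 entries are 0.16 fixed-point reciprocals -- multiplying a box sum by 65536/N and keeping the high 16 bits, which is what pmulhuw computes, approximates division by the box size N:

def mulhi_u16(box_sum, recip):
    # High 16 bits of the product, as pmulhuw uses these constants.
    return (box_sum * recip) >> 16

# 3x3 box of samples all equal to 200: sum = 1800, divide by 9.
print(mulhi_u16(1800, 65536 // 9))  # 199 -- truncation can be off by one)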
// Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -115,9 +116,9 @@
}
// Blends 32x1 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -149,9 +150,9 @@
}
// Blends 32x2 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -194,9 +195,9 @@
#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -223,9 +224,9 @@
}
// Blends 64x1 rectangle to 32x1.
-__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -261,9 +262,9 @@
// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
-__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
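(Editorial note on the rounding comment above: it leans on an exact identity. pavg computes (a + b + 1) >> 1, so averaging (sum >> 1) with zero equals the rounded (sum + 2) >> 2 for every unsigned sum, verifiable exhaustively over all 2x2 byte sums:

def pavg(a, b):
    # x86 pavgb/pavgw semantics: rounded average.
    return (a + b + 1) >> 1

assert all(pavg(s >> 1, 0) == (s + 2) >> 2 for s in range(4 * 255 + 1)))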
@@ -308,9 +309,9 @@
#endif // HAS_SCALEROWDOWN2_AVX2
// Point samples 32 pixels to 8 pixels.
-__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -340,9 +341,9 @@
}
// Blends 32x4 rectangle to 8x1.
-__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -399,9 +400,9 @@
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
-__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -434,9 +435,9 @@
}
// Blends 64x4 rectangle to 16x1.
-__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -498,9 +499,9 @@
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
-__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -546,9 +547,9 @@
// xmm7 kRound34
// Note that movdqa+palign may be better than movdqu.
-__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -603,9 +604,9 @@
}
// Note that movdqa+palign may be better than movdqu.
-__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -665,9 +666,9 @@
// 3/8 point sampler
// Scale 32 pixels to 12
-__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -697,9 +698,9 @@
}
// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -762,9 +763,9 @@
}
// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
- uint8* dst_ptr,
+ uint8_t* dst_ptr,
int dst_width) {
__asm {
push esi
@@ -807,8 +808,8 @@
}
// Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
- uint16* dst_ptr,
+__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
int src_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -816,7 +817,7 @@
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
- // sum rows
+ // sum rows
xloop:
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
@@ -838,8 +839,8 @@
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
- uint16* dst_ptr,
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+ uint16_t* dst_ptr,
int src_width) {
__asm {
mov eax, [esp + 4] // src_ptr
@@ -847,7 +848,7 @@
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
- // sum rows
+ // sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
@@ -870,16 +871,16 @@
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
-static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
- 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
- 0x4040, 0x4040, 0x4040, 0x4040};
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+ 0x4040, 0x4040, 0x4040, 0x4040};
// Bilinear column filtering. SSSE3 version.
-__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -939,7 +940,7 @@
add ecx, 2 - 1
jl xloop99
- // 1 pixel remainder
+ // 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
@@ -964,8 +965,8 @@
}
// Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
- const uint8* src_ptr,
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+ const uint8_t* src_ptr,
int dst_width,
int x,
int dx) {
@@ -991,9 +992,9 @@
}
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1016,9 +1017,9 @@
}
// Blends 8x1 rectangle to 4x1.
-__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
__asm {
mov eax, [esp + 4] // src_argb
@@ -1044,9 +1045,9 @@
}
// Blends 8x2 rectangle to 4x1.
-__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
__asm {
push esi
@@ -1078,10 +1079,10 @@
}
// Reads 4 pixels at a time.
-__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
__asm {
push ebx
@@ -1115,10 +1116,10 @@
}
// Blends four 2x2 to 4x1.
-__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
int src_stepx,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_width) {
__asm {
push ebx
@@ -1163,8 +1164,8 @@
}
// Column scaling unfiltered. SSE2 version.
-__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
@@ -1194,7 +1195,7 @@
sub ecx, 4
jl xloop49
- // 4 Pixel loop.
+ // 4 Pixel loop.
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1218,7 +1219,7 @@
test ecx, 2
je xloop29
- // 2 Pixels.
+ // 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
@@ -1231,7 +1232,7 @@
test ecx, 1
je xloop99
- // 1 Pixels.
+ // 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
xloop99:
@@ -1246,18 +1247,18 @@
// TODO(fbarchard): Port to Neon
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
+static const uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
+static const uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
-__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
- const uint8* src_argb,
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
@@ -1309,7 +1310,7 @@
add ecx, 2 - 1
jl xloop99
- // 1 pixel remainder
+ // 1 pixel remainder
psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
pshufb xmm2, xmm5 // 00000000
@@ -1329,8 +1330,8 @@
}
// Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
- const uint8* src_argb,
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+ const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
diff --git a/files/source/video_common.cc b/files/source/video_common.cc
index 3e9c6a2..92384c0 100644
--- a/files/source/video_common.cc
+++ b/files/source/video_common.cc
@@ -15,14 +15,13 @@
extern "C" {
#endif
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
struct FourCCAliasEntry {
- uint32 alias;
- uint32 canonical;
+ uint32_t alias;
+ uint32_t canonical;
};
-static const struct FourCCAliasEntry kFourCCAliases[] = {
+#define NUM_ALIASES 18
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
{FOURCC_IYUV, FOURCC_I420},
{FOURCC_YU12, FOURCC_I420},
{FOURCC_YU16, FOURCC_I422},
@@ -46,9 +45,9 @@
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
+uint32_t CanonicalFourCC(uint32_t fourcc) {
int i;
- for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ for (i = 0; i < NUM_ALIASES; ++i) {
if (kFourCCAliases[i].alias == fourcc) {
return kFourCCAliases[i].canonical;
}
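(Editorial sketch: the alias table keys and values are FourCC codes packed little-endian, as libyuv's FOURCC() macro does. A model of the packing and a lookup:

def fourcc(a, b, c, d):
    # Little-endian 4-byte packing, matching libyuv's FOURCC() macro.
    return ord(a) | ord(b) << 8 | ord(c) << 16 | ord(d) << 24

# CanonicalFourCC(fourcc('I', 'Y', 'U', 'V')) maps to
# fourcc('I', '4', '2', '0'); codes with no alias entry pass through.)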
diff --git a/files/tools_libyuv/OWNERS b/files/tools_libyuv/OWNERS
deleted file mode 100644
index aca046d..0000000
--- a/files/tools_libyuv/OWNERS
+++ /dev/null
@@ -1 +0,0 @@
-kjellander@chromium.org
diff --git a/files/tools_libyuv/autoroller/roll_deps.py b/files/tools_libyuv/autoroller/roll_deps.py
index a9eb307..8359d30 100755
--- a/files/tools_libyuv/autoroller/roll_deps.py
+++ b/files/tools_libyuv/autoroller/roll_deps.py
@@ -8,7 +8,7 @@
# be found in the AUTHORS file in the root of the source tree.
# This is a modified copy of the script in
-# https://chromium.googlesource.com/external/webrtc/+/master/tools-webrtc/autoroller/roll_deps.py
+# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
# customized for libyuv.
@@ -22,7 +22,7 @@
import re
import subprocess
import sys
-import urllib
+import urllib2
# Skip these dependencies (list without solution name prefix).
@@ -48,7 +48,6 @@
sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build'))
import find_depot_tools
find_depot_tools.add_depot_tools_to_path()
-from gclient import GClientKeywords
CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py'
CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools',
@@ -62,12 +61,14 @@
pass
+def VarLookup(local_scope):
+ return lambda var_name: local_scope['vars'][var_name]
+
+
def ParseDepsDict(deps_content):
local_scope = {}
- var = GClientKeywords.VarImpl({}, local_scope)
global_scope = {
- 'From': GClientKeywords.FromImpl,
- 'Var': var.Lookup,
+ 'Var': VarLookup(local_scope),
'deps_os': {},
}
exec(deps_content, global_scope, local_scope)
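(Editorial sketch: with GClientKeywords gone, Var() is now served by the VarLookup closure, which resolves names against the 'vars' dict that the DEPS content itself assigns during exec. Illustrated with a hypothetical DEPS snippet:

deps_content = '''
vars = {'chromium_git': 'https://chromium.googlesource.com'}
deps = {'src/third_party/libjpeg': Var('chromium_git') + '/libjpeg.git@abc'}
'''
# ParseDepsDict(deps_content)['deps']['src/third_party/libjpeg'] resolves
# Var('chromium_git') through the closure and yields
# 'https://chromium.googlesource.com/libjpeg.git@abc'.)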
@@ -89,7 +90,7 @@
for line in reversed(commit_message.splitlines()):
m = COMMIT_POSITION_RE.match(line.strip())
if m:
- return m.group(1)
+ return int(m.group(1))
logging.error('Failed to parse commit position id from:\n%s\n',
commit_message)
sys.exit(-1)
@@ -108,7 +109,7 @@
logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir)
env = os.environ.copy()
if extra_env:
- assert all(type(value) == str for value in extra_env.values())
+ assert all(isinstance(value, str) for value in extra_env.values())
logging.debug('extra env: %s', extra_env)
env.update(extra_env)
p = subprocess.Popen(command, stdout=subprocess.PIPE,
@@ -168,7 +169,7 @@
def ReadUrlContent(url):
"""Connect to a remote host and read the contents. Returns a list of lines."""
- conn = urllib.urlopen(url)
+ conn = urllib2.urlopen(url)
try:
return conn.readlines()
except IOError as e:
@@ -206,7 +207,15 @@
"""Builds a dict of paths to DepsEntry objects from a raw parsed deps dict."""
result = {}
def AddDepsEntries(deps_subdict):
- for path, deps_url in deps_subdict.iteritems():
+ for path, deps_url_spec in deps_subdict.iteritems():
+ # The deps url is either a URL with a condition, or just the URL.
+ if isinstance(deps_url_spec, dict):
+ if deps_url_spec.get('dep_type') == 'cipd':
+ continue
+ deps_url = deps_url_spec['url']
+ else:
+ deps_url = deps_url_spec
+
if not result.has_key(path):
url, revision = deps_url.split('@') if deps_url else (None, None)
result[path] = DepsEntry(path, url, revision)
@@ -289,9 +298,6 @@
commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval))
commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE %
rev_interval))
- # TBR field will be empty unless in some custom cases, where some engineers
- # are added.
- tbr_authors = ''
if changed_deps_list:
commit_msg.append('Changed dependencies:')
@@ -313,7 +319,11 @@
else:
commit_msg.append('No update to Clang.\n')
- commit_msg.append('TBR=%s' % tbr_authors)
+ # TBR needs to be non-empty for Gerrit to process it.
+ git_author = _RunCommand(['git', 'config', 'user.email'],
+ working_dir=CHECKOUT_SRC_DIR)[0].strip()
+ commit_msg.append('TBR=%s' % git_author)
+
commit_msg.append('BUG=None')
return '\n'.join(commit_msg)
@@ -334,17 +344,13 @@
local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path)
if not os.path.isdir(local_dep_dir):
raise RollError(
- 'Cannot find local directory %s. Either run\n'
- 'gclient sync --deps=all\n'
- 'or make sure the .gclient file for your solution contains all '
- 'platforms in the target_os list, i.e.\n'
+ 'Cannot find local directory %s. Make sure the .gclient file\n'
+ 'contains all platforms in the target_os list, i.e.\n'
'target_os = ["android", "unix", "mac", "ios", "win"];\n'
'Then run "gclient sync" again.' % local_dep_dir)
- _, stderr = _RunCommand(
- ['roll-dep-svn', '--no-verify-revision', dep.path, dep.new_rev],
- working_dir=CHECKOUT_SRC_DIR, ignore_exit_code=True)
- if stderr:
- logging.warning('roll-dep-svn: %s', stderr)
+ _RunCommand(
+ ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)],
+ working_dir=CHECKOUT_SRC_DIR)
def _IsTreeClean():
@@ -392,20 +398,36 @@
_RunCommand(['git', 'commit', '-m', commit_msg])
-def _UploadCL(dry_run, rietveld_email=None):
- logging.info('Uploading CL...')
- if not dry_run:
- cmd = ['git', 'cl', 'upload', '-f']
- if rietveld_email:
- cmd.append('--email=%s' % rietveld_email)
- _RunCommand(cmd, extra_env={'EDITOR': 'true'})
+def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos):
+ if skip_cq:
+ return 0
+ if (new_commit_pos - current_commit_pos) < cq_over:
+ return 1
+ return 2
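(Editorial note: ChooseCQMode maps the size of the roll onto a commit-queue mode -- 0 = skip, 1 = dry run, 2 = submit. With the default --cq-over=1:

# ChooseCQMode(skip_cq=False, cq_over=1, current_commit_pos=100,
#              new_commit_pos=100) -> 1 (dry run: diff 0 < 1)
# ChooseCQMode(False, 1, 100, 105) -> 2 (submit: diff 5 >= 1)
# ChooseCQMode(True, 1, 100, 105) -> 0 (upload only))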
-def _SendToCQ(dry_run, skip_cq):
- logging.info('Sending the CL to the CQ...')
- if not dry_run and not skip_cq:
- _RunCommand(['git', 'cl', 'set_commit'])
- logging.info('Sent the CL to the CQ.')
+def _UploadCL(commit_queue_mode):
+ """Upload the committed changes as a changelist to Gerrit.
+
+ commit_queue_mode:
+ - 2: Submit to commit queue.
+ - 1: Run trybots but do not submit to CQ.
+ - 0: Skip CQ, upload only.
+ """
+ cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail']
+ if commit_queue_mode >= 2:
+ logging.info('Sending the CL to the CQ...')
+ cmd.extend(['--use-commit-queue'])
+ elif commit_queue_mode >= 1:
+ logging.info('Starting CQ dry run...')
+ cmd.extend(['--cq-dry-run'])
+ extra_env = {
+ 'EDITOR': 'true',
+ 'SKIP_GCE_AUTH_FOR_GIT': '1',
+ }
+ stdout, stderr = _RunCommand(cmd, extra_env=extra_env)
+ logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s',
+ stdout, stderr)
def main():
@@ -415,10 +437,6 @@
p.add_argument('-r', '--revision',
help=('Chromium Git revision to roll to. Defaults to the '
'Chromium HEAD revision if omitted.'))
- p.add_argument('-u', '--rietveld-email',
- help=('E-mail address to use for creating the CL at Rietveld'
- 'If omitted a previously cached one will be used or an '
- 'error will be thrown during upload.'))
p.add_argument('--dry-run', action='store_true', default=False,
help=('Calculate changes and modify DEPS, but don\'t create '
'any local branch, commit, upload CL or send any '
@@ -427,8 +445,12 @@
default=False,
help=('Ignore if the current branch is not master or if there '
'are uncommitted changes (default: %(default)s).'))
- p.add_argument('--skip-cq', action='store_true', default=False,
- help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp = p.add_mutually_exclusive_group()
+ grp.add_argument('--skip-cq', action='store_true', default=False,
+ help='Skip sending the CL to the CQ (default: %(default)s)')
+ grp.add_argument('--cq-over', type=int, default=1,
+ help=('Commit queue dry run if the revision difference '
+ 'is below this number (default: %(default)s)'))
p.add_argument('-v', '--verbose', action='store_true', default=False,
help='Be extra verbose in printing of log messages.')
opts = p.parse_args()
@@ -473,8 +495,11 @@
_CreateRollBranch(opts.dry_run)
UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps)
_LocalCommit(commit_msg, opts.dry_run)
- _UploadCL(opts.dry_run, opts.rietveld_email)
- _SendToCQ(opts.dry_run, opts.skip_cq)
+ commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over,
+ current_commit_pos, new_commit_pos)
+ logging.info('Uploading CL...')
+ if not opts.dry_run:
+ _UploadCL(commit_queue_mode)
return 0
diff --git a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
index 025e46e..477b6e4 100755
--- a/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
+++ b/files/tools_libyuv/autoroller/unittests/roll_deps_test.py
@@ -82,6 +82,11 @@
self.assertEqual(self.fake.expectations, [])
setattr(roll_deps, '_RunCommand', self.old_RunCommand)
+ def testVarLookup(self):
+ local_scope = {'foo': 'wrong', 'vars': {'foo': 'bar'}}
+ lookup = roll_deps.VarLookup(local_scope)
+ self.assertEquals(lookup('foo'), 'bar')
+
def testUpdateDepsFile(self):
new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111'
diff --git a/files/tools_libyuv/get_landmines.py b/files/tools_libyuv/get_landmines.py
index 3dc78bb..c554f04 100755
--- a/files/tools_libyuv/get_landmines.py
+++ b/files/tools_libyuv/get_landmines.py
@@ -12,20 +12,8 @@
(or a list of 'landmines').
"""
-import os
import sys
-script_dir = os.path.dirname(os.path.realpath(__file__))
-checkout_root = os.path.abspath(os.path.join(script_dir, os.pardir))
-sys.path.insert(0, os.path.join(checkout_root, 'build'))
-import landmine_utils
-
-
-distributor = landmine_utils.distributor
-gyp_defines = landmine_utils.gyp_defines
-gyp_msvs_version = landmine_utils.gyp_msvs_version
-platform = landmine_utils.platform
-
def print_landmines():
"""
diff --git a/files/tools_libyuv/msan/OWNERS b/files/tools_libyuv/msan/OWNERS
deleted file mode 100644
index 60351e7..0000000
--- a/files/tools_libyuv/msan/OWNERS
+++ /dev/null
@@ -1,3 +0,0 @@
-pbos@chromium.org
-kjellander@chromium.org
-
diff --git a/files/tools_libyuv/ubsan/OWNERS b/files/tools_libyuv/ubsan/OWNERS
deleted file mode 100644
index b608519..0000000
--- a/files/tools_libyuv/ubsan/OWNERS
+++ /dev/null
@@ -1,4 +0,0 @@
-pbos@webrtc.org
-kjellander@webrtc.org
-fbarchard@chromium.org
-
diff --git a/files/tools_libyuv/ubsan/vptr_blacklist.txt b/files/tools_libyuv/ubsan/vptr_blacklist.txt
index 8ed070c..23cfca5 100644
--- a/files/tools_libyuv/ubsan/vptr_blacklist.txt
+++ b/files/tools_libyuv/ubsan/vptr_blacklist.txt
@@ -19,3 +19,7 @@
# Example:
# src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc
+#############################################################################
+# UBsan goes into an infinite recursion when __dynamic_cast is instrumented
+# with "vptr". See crbug.com/609786.
+src:*/third_party/libc\+\+abi/trunk/src/private_typeinfo.cpp
diff --git a/files/tools_libyuv/valgrind/chrome_tests.bat b/files/tools_libyuv/valgrind/chrome_tests.bat
new file mode 100755
index 0000000..9d4c8ca
--- /dev/null
+++ b/files/tools_libyuv/valgrind/chrome_tests.bat
@@ -0,0 +1,53 @@
+@echo off
+:: Copyright (c) 2011 The Chromium Authors. All rights reserved.
+:: Use of this source code is governed by a BSD-style license that can be
+:: found in the LICENSE file.
+
+setlocal
+
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if %1 == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (e.g. drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+echo "Unknown tool: `%TOOL_NAME%`! Only drmemory is supported right now"
+exit /B 1
+
+:SETUP_DRMEMORY
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%../python/google
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%/chrome_tests.py %*
diff --git a/files/tools_libyuv/valgrind/chrome_tests.py b/files/tools_libyuv/valgrind/chrome_tests.py
new file mode 100755
index 0000000..fe899bc
--- /dev/null
+++ b/files/tools_libyuv/valgrind/chrome_tests.py
@@ -0,0 +1,869 @@
+#!/usr/bin/env python
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+''' Runs various chrome tests through valgrind_test.py.'''
+
+import glob
+import logging
+import multiprocessing
+import optparse
+import os
+import stat
+import subprocess
+import sys
+
+import logging_utils
+import path_utils
+
+import common
+import valgrind_test
+
+class TestNotFound(Exception): pass
+
+class MultipleGTestFiltersSpecified(Exception): pass
+
+class BuildDirNotFound(Exception): pass
+
+class BuildDirAmbiguous(Exception): pass
+
+class ExecutableNotFound(Exception): pass
+
+class BadBinary(Exception): pass
+
+class ChromeTests:
+ SLOW_TOOLS = ["memcheck", "drmemory"]
+ LAYOUT_TESTS_DEFAULT_CHUNK_SIZE = 300
+
+ def __init__(self, options, args, test):
+ if ':' in test:
+ (self._test, self._gtest_filter) = test.split(':', 1)
+ else:
+ self._test = test
+ self._gtest_filter = options.gtest_filter
+
+ if self._test not in self._test_list:
+ raise TestNotFound("Unknown test: %s" % test)
+
+ if options.gtest_filter and options.gtest_filter != self._gtest_filter:
+ raise MultipleGTestFiltersSpecified("Cannot specify both --gtest_filter "
+ "and --test %s" % test)
+
+ self._options = options
+ self._args = args
+
+ script_dir = path_utils.ScriptDir()
+ # Compute the top of the tree (the "source dir") from the script dir (where
+ # this script lives). We assume that the script dir is in tools/valgrind/
+ # relative to the top of the tree.
+ self._source_dir = os.path.dirname(os.path.dirname(script_dir))
+ # since this path is used for string matching, make sure it's always
+ # an absolute Unix-style path
+ self._source_dir = os.path.abspath(self._source_dir).replace('\\', '/')
+ valgrind_test_script = os.path.join(script_dir, "valgrind_test.py")
+ self._command_preamble = ["--source-dir=%s" % (self._source_dir)]
+
+ if not self._options.build_dir:
+ dirs = [
+ os.path.join(self._source_dir, "xcodebuild", "Debug"),
+ os.path.join(self._source_dir, "out", "Debug"),
+ os.path.join(self._source_dir, "build", "Debug"),
+ ]
+ build_dir = [d for d in dirs if os.path.isdir(d)]
+ if len(build_dir) > 1:
+ raise BuildDirAmbiguous("Found more than one suitable build dir:\n"
+ "%s\nPlease specify just one "
+ "using --build-dir" % ", ".join(build_dir))
+ elif build_dir:
+ self._options.build_dir = build_dir[0]
+ else:
+ self._options.build_dir = None
+
+ if self._options.build_dir:
+ build_dir = os.path.abspath(self._options.build_dir)
+ self._command_preamble += ["--build-dir=%s" % (self._options.build_dir)]
+
+ def _EnsureBuildDirFound(self):
+ if not self._options.build_dir:
+ raise BuildDirNotFound("Oops, couldn't find a build dir, please "
+ "specify it manually using --build-dir")
+
+ def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
+ '''Generates the default command array that most tests will use.'''
+ if exe and common.IsWindows():
+ exe += '.exe'
+
+ cmd = list(self._command_preamble)
+
+ # Find all suppressions matching the following pattern:
+ # tools/valgrind/TOOL/suppressions[_PLATFORM].txt
+ # and list them with --suppressions= prefix.
+ script_dir = path_utils.ScriptDir()
+ tool_name = tool.ToolName()
+ suppression_file = os.path.join(script_dir, tool_name, "suppressions.txt")
+ if os.path.exists(suppression_file):
+ cmd.append("--suppressions=%s" % suppression_file)
+ # Platform-specific suppression
+ for platform in common.PlatformNames():
+ platform_suppression_file = \
+ os.path.join(script_dir, tool_name, 'suppressions_%s.txt' % platform)
+ if os.path.exists(platform_suppression_file):
+ cmd.append("--suppressions=%s" % platform_suppression_file)
+
+ if tool_name == "drmemory":
+ if self._options.drmemory_ops:
+ # Prepend " " to keep Dr. Memory's options from confusing optparse.
+ cmd += ["--drmemory_ops", " " + self._options.drmemory_ops]
+
+ if self._options.valgrind_tool_flags:
+ cmd += self._options.valgrind_tool_flags.split(" ")
+ if self._options.keep_logs:
+ cmd += ["--keep_logs"]
+ if valgrind_test_args != None:
+ for arg in valgrind_test_args:
+ cmd.append(arg)
+ if exe:
+ self._EnsureBuildDirFound()
+ exe_path = os.path.join(self._options.build_dir, exe)
+ if not os.path.exists(exe_path):
+ raise ExecutableNotFound("Couldn't find '%s'" % exe_path)
+
+ # Make sure we don't try to test ASan-built binaries
+ # with other dynamic instrumentation-based tools.
+ # TODO(timurrrr): also check TSan and MSan?
+ # `nm` might not be available, so use try-except.
+ try:
+ # Do not perform this check on OS X, as 'nm' on 10.6 can't handle
+ # binaries built with Clang 3.5+.
+ if not common.IsMac():
+ nm_output = subprocess.check_output(["nm", exe_path])
+ if nm_output.find("__asan_init") != -1:
+ raise BadBinary("You're trying to run an executable instrumented "
+ "with AddressSanitizer under %s. Please provide "
+ "an uninstrumented executable." % tool_name)
+ except OSError:
+ pass
+
+ cmd.append(exe_path)
+ # Valgrind runs tests slowly, so slow tests hurt more; show elapsed time
+ # so we can find the slowpokes.
+ cmd.append("--gtest_print_time")
+ # The built-in test launcher for gtest-based executables runs tests in
+ # multiple processes by default. Force single-process mode back on.
+ cmd.append("--single-process-tests")
+ if self._options.gtest_repeat:
+ cmd.append("--gtest_repeat=%s" % self._options.gtest_repeat)
+ if self._options.gtest_shuffle:
+ cmd.append("--gtest_shuffle")
+ if self._options.gtest_break_on_failure:
+ cmd.append("--gtest_break_on_failure")
+ if self._options.test_launcher_bot_mode:
+ cmd.append("--test-launcher-bot-mode")
+ if self._options.test_launcher_total_shards is not None:
+ cmd.append("--test-launcher-total-shards=%d"
+ % self._options.test_launcher_total_shards)
+ if self._options.test_launcher_shard_index is not None:
+ cmd.append("--test-launcher-shard-index=%d"
+ % self._options.test_launcher_shard_index)
+ return cmd
+
+ def Run(self):
+ ''' Runs the test specified by command-line argument --test '''
+ logging.info("running test %s" % (self._test))
+ return self._test_list[self._test](self)
+
+ def _AppendGtestFilter(self, tool, name, cmd):
+ '''Append an appropriate --gtest_filter flag to the googletest binary
+ invocation.
+ If the user passed their own filter mentioning only one test, just use
+ it. Otherwise, filter out tests listed in the appropriate gtest_exclude
+ files.
+ '''
+ if (self._gtest_filter and
+ ":" not in self._gtest_filter and
+ "?" not in self._gtest_filter and
+ "*" not in self._gtest_filter):
+ cmd.append("--gtest_filter=%s" % self._gtest_filter)
+ return
+
+ filters = []
+ gtest_files_dir = os.path.join(path_utils.ScriptDir(), "gtest_exclude")
+
+ gtest_filter_files = [
+ os.path.join(gtest_files_dir, name + ".gtest-%s.txt" % tool.ToolName())]
+ # Use ".gtest.txt" files only for slow tools, as they now contain
+ # Valgrind- and Dr.Memory-specific filters.
+ # TODO(glider): rename the files to ".gtest_slow.txt"
+ if tool.ToolName() in ChromeTests.SLOW_TOOLS:
+ gtest_filter_files += [os.path.join(gtest_files_dir, name + ".gtest.txt")]
+ for platform_suffix in common.PlatformNames():
+ gtest_filter_files += [
+ os.path.join(gtest_files_dir, name + ".gtest_%s.txt" % platform_suffix),
+ os.path.join(gtest_files_dir, name + ".gtest-%s_%s.txt" % \
+ (tool.ToolName(), platform_suffix))]
+ logging.info("Reading gtest exclude filter files:")
+ for filename in gtest_filter_files:
+ # strip the leading absolute path (may be very long on the bot)
+ # and the following / or \.
+ readable_filename = filename.replace("\\", "/") # '\' on Windows
+ readable_filename = readable_filename.replace(self._source_dir, "")[1:]
+ if not os.path.exists(filename):
+ logging.info(" \"%s\" - not found" % readable_filename)
+ continue
+ logging.info(" \"%s\" - OK" % readable_filename)
+ f = open(filename, 'r')
+ for line in f.readlines():
+ if line.startswith("#") or line.startswith("//") or line.isspace():
+ continue
+ line = line.rstrip()
+ test_prefixes = ["FLAKY", "FAILS"]
+ for p in test_prefixes:
+ # Strip prefixes from the test names.
+ line = line.replace(".%s_" % p, ".")
+ # Exclude the original test name.
+ filters.append(line)
+ if line[-2:] != ".*":
+ # List all possible prefixes if line doesn't end with ".*".
+ for p in test_prefixes:
+ filters.append(line.replace(".", ".%s_" % p))
+ # Get rid of duplicates.
+ filters = set(filters)
+ gtest_filter = self._gtest_filter
+ if len(filters):
+ if gtest_filter:
+ gtest_filter += ":"
+ if gtest_filter.find("-") < 0:
+ gtest_filter += "-"
+ else:
+ gtest_filter = "-"
+ gtest_filter += ":".join(filters)
+ if gtest_filter:
+ cmd.append("--gtest_filter=%s" % gtest_filter)
+
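(Editorial note: the exclusions collapse into a single negative --gtest_filter clause appended after any user-supplied positive patterns. For example, with hypothetical exclusions {"Foo.Bar", "Baz.*"}:

# No user filter:        --gtest_filter=-Foo.Bar:Baz.*
# User filter "Media*":  --gtest_filter=Media*:-Foo.Bar:Baz.*)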
+ @staticmethod
+ def ShowTests():
+ test_to_names = {}
+ for name, test_function in ChromeTests._test_list.iteritems():
+ test_to_names.setdefault(test_function, []).append(name)
+
+ name_to_aliases = {}
+ for names in test_to_names.itervalues():
+ names.sort(key=lambda name: len(name))
+ name_to_aliases[names[0]] = names[1:]
+
+ print
+ print "Available tests:"
+ print "----------------"
+ for name, aliases in sorted(name_to_aliases.iteritems()):
+ if aliases:
+ print " {} (aka {})".format(name, ', '.join(aliases))
+ else:
+ print " {}".format(name)
+
+ def SetupLdPath(self, requires_build_dir):
+ if requires_build_dir:
+ self._EnsureBuildDirFound()
+ elif not self._options.build_dir:
+ return
+
+ # Append build_dir to LD_LIBRARY_PATH so external libraries can be loaded.
+ if (os.getenv("LD_LIBRARY_PATH")):
+ os.putenv("LD_LIBRARY_PATH", "%s:%s" % (os.getenv("LD_LIBRARY_PATH"),
+ self._options.build_dir))
+ else:
+ os.putenv("LD_LIBRARY_PATH", self._options.build_dir)
+
+ def SimpleTest(self, module, name, valgrind_test_args=None, cmd_args=None):
+ tool = valgrind_test.CreateTool(self._options.valgrind_tool)
+ cmd = self._DefaultCommand(tool, name, valgrind_test_args)
+ self._AppendGtestFilter(tool, name, cmd)
+ cmd.extend(['--test-tiny-timeout=1000'])
+ if cmd_args:
+ cmd.extend(cmd_args)
+
+ self.SetupLdPath(True)
+ return tool.Run(cmd, module)
+
+ def RunCmdLine(self):
+ tool = valgrind_test.CreateTool(self._options.valgrind_tool)
+ cmd = self._DefaultCommand(tool, None, self._args)
+ self.SetupLdPath(False)
+ return tool.Run(cmd, None)
+
+ def TestAccessibility(self):
+ return self.SimpleTest("accessibility", "accessibility_unittests")
+
+ def TestAddressInput(self):
+ return self.SimpleTest("addressinput", "libaddressinput_unittests")
+
+ def TestAngle(self):
+ return self.SimpleTest("angle", "angle_unittests")
+
+ def TestAppList(self):
+ return self.SimpleTest("app_list", "app_list_unittests")
+
+ def TestAsh(self):
+ return self.SimpleTest("ash", "ash_unittests")
+
+ def TestAura(self):
+ return self.SimpleTest("aura", "aura_unittests")
+
+ def TestBase(self):
+ return self.SimpleTest("base", "base_unittests")
+
+ def TestBlinkHeap(self):
+ return self.SimpleTest("blink_heap", "blink_heap_unittests")
+
+ def TestBlinkPlatform(self):
+ return self.SimpleTest("blink_platform", "blink_platform_unittests")
+
+ def TestCacheInvalidation(self):
+ return self.SimpleTest("cacheinvalidation", "cacheinvalidation_unittests")
+
+ def TestCast(self):
+ return self.SimpleTest("chrome", "cast_unittests")
+
+ def TestCC(self):
+ return self.SimpleTest("cc", "cc_unittests",
+ cmd_args=[
+ "--cc-layer-tree-test-long-timeout"])
+
+ def TestChromeApp(self):
+ return self.SimpleTest("chrome_app", "chrome_app_unittests")
+
+ def TestChromeElf(self):
+ return self.SimpleTest("chrome_elf", "chrome_elf_unittests")
+
+ def TestChromeDriver(self):
+ return self.SimpleTest("chromedriver", "chromedriver_unittests")
+
+ def TestChromeOS(self):
+ return self.SimpleTest("chromeos", "chromeos_unittests")
+
+ def TestComponents(self):
+ return self.SimpleTest("components", "components_unittests")
+
+ def TestCompositor(self):
+ return self.SimpleTest("compositor", "compositor_unittests")
+
+ def TestContent(self):
+ return self.SimpleTest("content", "content_unittests")
+
+ def TestCourgette(self):
+ return self.SimpleTest("courgette", "courgette_unittests")
+
+ def TestCrypto(self):
+ return self.SimpleTest("crypto", "crypto_unittests")
+
+ def TestDevice(self):
+ return self.SimpleTest("device", "device_unittests")
+
+ def TestDisplay(self):
+ return self.SimpleTest("display", "display_unittests")
+
+ def TestEvents(self):
+ return self.SimpleTest("events", "events_unittests")
+
+ def TestExtensions(self):
+ return self.SimpleTest("extensions", "extensions_unittests")
+
+ def TestFFmpegRegressions(self):
+ return self.SimpleTest("chrome", "ffmpeg_regression_tests")
+
+ def TestGCM(self):
+ return self.SimpleTest("gcm", "gcm_unit_tests")
+
+ def TestGfx(self):
+ return self.SimpleTest("gfx", "gfx_unittests")
+
+ def TestGin(self):
+ return self.SimpleTest("gin", "gin_unittests")
+
+ def TestGoogleApis(self):
+ return self.SimpleTest("google_apis", "google_apis_unittests")
+
+ def TestGPU(self):
+ return self.SimpleTest("gpu", "gpu_unittests")
+
+ def TestIpc(self):
+ return self.SimpleTest("ipc", "ipc_tests",
+ valgrind_test_args=["--trace_children"])
+
+ def TestInstallerUtil(self):
+ return self.SimpleTest("installer_util", "installer_util_unittests")
+
+ def TestInstallStatic(self):
+ return self.SimpleTest("install_static", "install_static_unittests")
+
+ def TestJingle(self):
+ return self.SimpleTest("chrome", "jingle_unittests")
+
+ def TestKeyboard(self):
+ return self.SimpleTest("keyboard", "keyboard_unittests")
+
+ def TestLatency(self):
+ return self.SimpleTest("latency", "latency_unittests")
+
+ def TestMedia(self):
+ return self.SimpleTest("chrome", "media_unittests")
+
+ def TestMessageCenter(self):
+ return self.SimpleTest("message_center", "message_center_unittests")
+
+ def TestMidi(self):
+ return self.SimpleTest("chrome", "midi_unittests")
+
+ def TestMojoCommon(self):
+ return self.SimpleTest("mojo_common", "mojo_common_unittests")
+
+ def TestMojoPublicBindings(self):
+ return self.SimpleTest("mojo_public_bindings",
+ "mojo_public_bindings_unittests")
+
+ def TestMojoPublicSystem(self):
+ return self.SimpleTest("mojo_public_system",
+ "mojo_public_system_unittests")
+
+ def TestMojoPublicSysPerf(self):
+ return self.SimpleTest("mojo_public_sysperf",
+ "mojo_public_system_perftests")
+
+ def TestMojoSystem(self):
+ return self.SimpleTest("mojo_system", "mojo_system_unittests")
+
+ def TestNet(self):
+ return self.SimpleTest("net", "net_unittests")
+
+ def TestNetPerf(self):
+ return self.SimpleTest("net", "net_perftests")
+
+ def TestPhoneNumber(self):
+ return self.SimpleTest("phonenumber", "libphonenumber_unittests")
+
+ def TestPPAPI(self):
+ return self.SimpleTest("chrome", "ppapi_unittests")
+
+ def TestPrinting(self):
+ return self.SimpleTest("chrome", "printing_unittests")
+
+ def TestRemoting(self):
+ return self.SimpleTest("chrome", "remoting_unittests",
+ cmd_args=[
+ "--ui-test-action-timeout=60000",
+ "--ui-test-action-max-timeout=150000"])
+
+ def TestSkia(self):
+ return self.SimpleTest("skia", "skia_unittests")
+
+ def TestSql(self):
+ return self.SimpleTest("chrome", "sql_unittests")
+
+ def TestStorage(self):
+ return self.SimpleTest("storage", "storage_unittests")
+
+ def TestLinuxSandbox(self):
+ return self.SimpleTest("sandbox", "sandbox_linux_unittests")
+
+ def TestUnit(self):
+ # http://crbug.com/51716
+ # Disabling all unit tests
+ # Problems reappeared after r119922
+ if common.IsMac() and (self._options.valgrind_tool == "memcheck"):
+ logging.warning("unit_tests are disabled for memcheck on MacOS.")
+ return 0
+ return self.SimpleTest("chrome", "unit_tests")
+
+ def TestUIBaseUnit(self):
+ return self.SimpleTest("chrome", "ui_base_unittests")
+
+ def TestUIChromeOS(self):
+ return self.SimpleTest("chrome", "ui_chromeos_unittests")
+
+ def TestURL(self):
+ return self.SimpleTest("chrome", "url_unittests")
+
+ def TestViews(self):
+ return self.SimpleTest("views", "views_unittests")
+
+
+ # Valgrind timeouts are in seconds.
+ UI_VALGRIND_ARGS = ["--timeout=14400", "--trace_children", "--indirect"]
+ # UI test timeouts are in milliseconds.
+ UI_TEST_ARGS = ["--ui-test-action-timeout=60000",
+ "--ui-test-action-max-timeout=150000",
+ "--no-sandbox"]
+
+ # TODO(thestig) fine-tune these values.
+ # Valgrind timeouts are in seconds.
+ BROWSER_VALGRIND_ARGS = ["--timeout=50000", "--trace_children", "--indirect"]
+ # Browser test timeouts are in milliseconds.
+ BROWSER_TEST_ARGS = ["--ui-test-action-timeout=400000",
+ "--ui-test-action-max-timeout=800000",
+ "--no-sandbox"]
+
+ def TestBrowser(self):
+ return self.SimpleTest("chrome", "browser_tests",
+ valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
+ cmd_args=self.BROWSER_TEST_ARGS)
+
+ def TestContentBrowser(self):
+ return self.SimpleTest("content", "content_browsertests",
+ valgrind_test_args=self.BROWSER_VALGRIND_ARGS,
+ cmd_args=self.BROWSER_TEST_ARGS)
+
+ def TestInteractiveUI(self):
+ return self.SimpleTest("chrome", "interactive_ui_tests",
+ valgrind_test_args=self.UI_VALGRIND_ARGS,
+ cmd_args=self.UI_TEST_ARGS)
+
+ def TestSyncIntegration(self):
+ return self.SimpleTest("chrome", "sync_integration_tests",
+ valgrind_test_args=self.UI_VALGRIND_ARGS,
+ cmd_args=(["--ui-test-action-max-timeout=450000"]))
+
+ def TestLayoutChunk(self, chunk_num, chunk_size):
+ # Run tests [chunk_num*chunk_size .. (chunk_num+1)*chunk_size) from the
+ # list of tests. Wrap around to beginning of list at end.
+ # If chunk_size is zero, run all tests in the list once.
+ # If a text file is given as argument, it is used as the list of tests.
+ assert((chunk_size == 0) != (len(self._args) == 0))
+ # Build the ginormous commandline in 'cmd'.
+ # It's going to be roughly
+ # python valgrind_test.py ...
+ # but we'll use the --indirect flag to valgrind_test.py
+ # to avoid valgrinding python.
+ # Start by building the valgrind_test.py commandline.
+ tool = valgrind_test.CreateTool(self._options.valgrind_tool)
+ cmd = self._DefaultCommand(tool)
+ cmd.append("--trace_children")
+ cmd.append("--indirect_webkit_layout")
+ cmd.append("--ignore_exit_code")
+ # Now build script_cmd, the run-webkits-tests commandline.
+ # Store each chunk in its own directory so that we can find the data later
+ chunk_dir = os.path.join("layout", "chunk_%05d" % chunk_num)
+ out_dir = os.path.join(path_utils.ScriptDir(), "latest")
+ out_dir = os.path.join(out_dir, chunk_dir)
+ if os.path.exists(out_dir):
+ old_files = glob.glob(os.path.join(out_dir, "*.txt"))
+ for f in old_files:
+ os.remove(f)
+ else:
+ os.makedirs(out_dir)
+ script = os.path.join(self._source_dir, "third_party", "WebKit", "Tools",
+ "Scripts", "run-webkit-tests")
+ # http://crbug.com/260627: After the switch to content_shell from DRT, each
+ # test now brings up 3 processes. Under Valgrind, they become memory bound
+ # and can eventually OOM if we don't reduce the total count.
+ # It'd be nice if content_shell automatically throttled the startup of new
+ # tests if we're low on memory.
+ jobs = max(1, int(multiprocessing.cpu_count() * 0.3))
+ script_cmd = ["python", script, "-v",
+ # run a separate DumpRenderTree for each test
+ "--batch-size=1",
+ "--fully-parallel",
+ "--child-processes=%d" % jobs,
+ "--time-out-ms=800000",
+ "--no-retry-failures", # retrying takes too much time
+ # http://crbug.com/176908: Don't launch a browser when done.
+ "--no-show-results",
+ "--nocheck-sys-deps",
+ "--additional-driver-flag=--no-sandbox"]
+ # Pass build mode to run-webkit-tests. We aren't passed it directly,
+ # so parse it out of build_dir. run-webkit-tests can only handle
+ # the two values "Release" and "Debug".
+ # TODO(Hercules): unify how all our scripts pass around build mode
+ # (--mode / --target / --build-dir / --debug)
+ if self._options.build_dir:
+ build_root, mode = os.path.split(self._options.build_dir)
+ script_cmd.extend(["--build-directory", build_root, "--target", mode])
+ if (chunk_size > 0):
+ script_cmd.append("--run-chunk=%d:%d" % (chunk_num, chunk_size))
+ if len(self._args):
+ # if the arg is a txt file, then treat it as a list of tests
+ if os.path.isfile(self._args[0]) and self._args[0][-4:] == ".txt":
+ script_cmd.append("--test-list=%s" % self._args[0])
+ else:
+ script_cmd.extend(self._args)
+ self._AppendGtestFilter(tool, "layout", script_cmd)
+ # Now run script_cmd with the wrapper in cmd
+ cmd.extend(["--"])
+ cmd.extend(script_cmd)
+
+ # Layout tests often times fail quickly, but the buildbot remains green.
+ # Detect this situation when running with the default chunk size.
+ if chunk_size == self.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE:
+ min_runtime_in_seconds=120
+ else:
+ min_runtime_in_seconds=0
+ ret = tool.Run(cmd, "layout", min_runtime_in_seconds=min_runtime_in_seconds)
+ return ret
+
+
+ def TestLayout(self):
+ # A "chunk file" is maintained in the local directory so that each test
+ # runs a slice of the layout tests of size chunk_size that increments with
+ # each run. Since tests can be added and removed from the layout tests at
+ # any time, this is not going to give exact coverage, but it will allow us
+ # to continuously run small slices of the layout tests under valgrind rather
+ # than having to run all of them in one shot.
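+    # For example, with a chunk_size of 300, run 0 covers tests [0, 300),
+    # run 1 covers [300, 600), and so on, wrapping back to the start of the
+    # list when the end is reached.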
+ chunk_size = self._options.num_tests
+ if chunk_size == 0 or len(self._args):
+ return self.TestLayoutChunk(0, 0)
+ chunk_num = 0
+ chunk_file = os.path.join("valgrind_layout_chunk.txt")
+ logging.info("Reading state from " + chunk_file)
+ try:
+ f = open(chunk_file)
+ if f:
+ chunk_str = f.read()
+ if len(chunk_str):
+ chunk_num = int(chunk_str)
+ # This should be enough so that we have a couple of complete runs
+ # of test data stored in the archive (although note that when we loop
+        # we're almost guaranteed not to be at the end of the test list)
+ if chunk_num > 10000:
+ chunk_num = 0
+ f.close()
+ except IOError, (errno, strerror):
+ logging.error("error reading from file %s (%d, %s)" % (chunk_file,
+ errno, strerror))
+ # Save the new chunk size before running the tests. Otherwise if a
+ # particular chunk hangs the bot, the chunk number will never get
+ # incremented and the bot will be wedged.
+ logging.info("Saving state to " + chunk_file)
+ try:
+ f = open(chunk_file, "w")
+ chunk_num += 1
+ f.write("%d" % chunk_num)
+ f.close()
+ except IOError, (errno, strerror):
+ logging.error("error writing to file %s (%d, %s)" % (chunk_file, errno,
+ strerror))
+ # Since we're running small chunks of the layout tests, it's important to
+ # mark the ones that have errors in them. These won't be visible in the
+ # summary list for long, but will be useful for someone reviewing this bot.
+ return self.TestLayoutChunk(chunk_num, chunk_size)
+
+ # The known list of tests.
+ # Recognise the original abbreviations as well as full executable names.
+ _test_list = {
+ "cmdline" : RunCmdLine,
+ "addressinput": TestAddressInput,
+ "libaddressinput_unittests": TestAddressInput,
+ "accessibility": TestAccessibility,
+ "angle": TestAngle, "angle_unittests": TestAngle,
+ "app_list": TestAppList, "app_list_unittests": TestAppList,
+ "ash": TestAsh, "ash_unittests": TestAsh,
+ "aura": TestAura, "aura_unittests": TestAura,
+ "base": TestBase, "base_unittests": TestBase,
+ "blink_heap": TestBlinkHeap,
+ "blink_platform": TestBlinkPlatform,
+ "browser": TestBrowser, "browser_tests": TestBrowser,
+ "cacheinvalidation": TestCacheInvalidation,
+ "cacheinvalidation_unittests": TestCacheInvalidation,
+ "cast": TestCast, "cast_unittests": TestCast,
+ "cc": TestCC, "cc_unittests": TestCC,
+ "chrome_app": TestChromeApp,
+ "chrome_elf": TestChromeElf,
+ "chromedriver": TestChromeDriver,
+ "chromeos": TestChromeOS, "chromeos_unittests": TestChromeOS,
+ "components": TestComponents,"components_unittests": TestComponents,
+ "compositor": TestCompositor,"compositor_unittests": TestCompositor,
+ "content": TestContent, "content_unittests": TestContent,
+ "content_browsertests": TestContentBrowser,
+ "courgette": TestCourgette, "courgette_unittests": TestCourgette,
+ "crypto": TestCrypto, "crypto_unittests": TestCrypto,
+ "device": TestDevice, "device_unittests": TestDevice,
+ "display": TestDisplay, "display_unittests": TestDisplay,
+ "events": TestEvents, "events_unittests": TestEvents,
+ "extensions": TestExtensions, "extensions_unittests": TestExtensions,
+ "ffmpeg_regression_tests": TestFFmpegRegressions,
+ "gcm": TestGCM, "gcm_unit_tests": TestGCM,
+ "gin": TestGin, "gin_unittests": TestGin,
+ "gfx": TestGfx, "gfx_unittests": TestGfx,
+ "google_apis": TestGoogleApis,
+ "gpu": TestGPU, "gpu_unittests": TestGPU,
+ "ipc": TestIpc, "ipc_tests": TestIpc,
+ "installer_util": TestInstallerUtil,
+ "installer_util_unittests": TestInstallerUtil,
+ "install_static_unittests": TestInstallStatic,
+ "interactive_ui": TestInteractiveUI,
+ "jingle": TestJingle, "jingle_unittests": TestJingle,
+ "keyboard": TestKeyboard, "keyboard_unittests": TestKeyboard,
+ "latency": TestLatency, "latency_unittests": TestLatency,
+ "layout": TestLayout, "layout_tests": TestLayout,
+ "media": TestMedia, "media_unittests": TestMedia,
+ "message_center": TestMessageCenter,
+ "message_center_unittests" : TestMessageCenter,
+ "midi": TestMidi, "midi_unittests": TestMidi,
+ "mojo_common": TestMojoCommon,
+ "mojo_common_unittests": TestMojoCommon,
+ "mojo_system": TestMojoSystem,
+ "mojo_system_unittests": TestMojoSystem,
+ "mojo_public_system": TestMojoPublicSystem,
+ "mojo_public_system_unittests": TestMojoPublicSystem,
+ "mojo_public_bindings": TestMojoPublicBindings,
+ "mojo_public_bindings_unittests": TestMojoPublicBindings,
+ "mojo_public_sysperf": TestMojoPublicSysPerf,
+ "net": TestNet, "net_unittests": TestNet,
+ "net_perf": TestNetPerf, "net_perftests": TestNetPerf,
+ "phonenumber": TestPhoneNumber,
+ "libphonenumber_unittests": TestPhoneNumber,
+ "ppapi": TestPPAPI, "ppapi_unittests": TestPPAPI,
+ "printing": TestPrinting, "printing_unittests": TestPrinting,
+ "remoting": TestRemoting, "remoting_unittests": TestRemoting,
+ "sandbox": TestLinuxSandbox, "sandbox_linux_unittests": TestLinuxSandbox,
+ "skia": TestSkia, "skia_unittests": TestSkia,
+ "sql": TestSql, "sql_unittests": TestSql,
+ "storage": TestStorage, "storage_unittests": TestStorage,
+ "sync_integration_tests": TestSyncIntegration,
+ "sync_integration": TestSyncIntegration,
+ "ui_base_unit": TestUIBaseUnit, "ui_base_unittests": TestUIBaseUnit,
+ "ui_chromeos": TestUIChromeOS, "ui_chromeos_unittests": TestUIChromeOS,
+ "unit": TestUnit, "unit_tests": TestUnit,
+ "url": TestURL, "url_unittests": TestURL,
+ "views": TestViews, "views_unittests": TestViews,
+ "webkit": TestLayout,
+ }
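+  # Both the short name and the full executable name map to the same bound
+  # method, so e.g. "-t base" and "-t base_unittests" both run TestBase.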
+
+
+def _main():
+ parser = optparse.OptionParser("usage: %prog -b <dir> -t <test> "
+ "[-t <test> ...]")
+
+ parser.add_option("--help-tests", dest="help_tests", action="store_true",
+ default=False, help="List all available tests")
+ parser.add_option("-b", "--build-dir",
+ help="the location of the compiler output")
+ parser.add_option("--target", help="Debug or Release")
+ parser.add_option("-t", "--test", action="append", default=[],
+ help="which test to run, supports test:gtest_filter format "
+ "as well.")
+ parser.add_option("--baseline", action="store_true", default=False,
+ help="generate baseline data instead of validating")
+ parser.add_option("-f", "--force", action="store_true", default=False,
+ help="run a broken test anyway")
+ parser.add_option("--gtest_filter",
+ help="additional arguments to --gtest_filter")
+ parser.add_option("--gtest_repeat", help="argument for --gtest_repeat")
+ parser.add_option("--gtest_shuffle", action="store_true", default=False,
+ help="Randomize tests' orders on every iteration.")
+ parser.add_option("--gtest_break_on_failure", action="store_true",
+ default=False,
+ help="Drop in to debugger on assertion failure. Also "
+ "useful for forcing tests to exit with a stack dump "
+ "on the first assertion failure when running with "
+ "--gtest_repeat=-1")
+ parser.add_option("-v", "--verbose", action="store_true", default=False,
+ help="verbose output - enable debug log messages")
+ parser.add_option("--tool", dest="valgrind_tool", default="memcheck",
+ help="specify a valgrind tool to run the tests under")
+ parser.add_option("--tool_flags", dest="valgrind_tool_flags", default="",
+ help="specify custom flags for the selected valgrind tool")
+ parser.add_option("--keep_logs", action="store_true", default=False,
+ help="store memory tool logs in the <tool>.logs directory "
+ "instead of /tmp.\nThis can be useful for tool "
+ "developers/maintainers.\nPlease note that the <tool>"
+ ".logs directory will be clobbered on tool startup.")
+ parser.add_option("-n", "--num_tests", type="int",
+ default=ChromeTests.LAYOUT_TESTS_DEFAULT_CHUNK_SIZE,
+ help="for layout tests: # of subtests per run. 0 for all.")
+ parser.add_option("--test-launcher-bot-mode", action="store_true",
+ help="run the tests with --test-launcher-bot-mode")
+ parser.add_option("--test-launcher-total-shards", type=int,
+ help="run the tests with --test-launcher-total-shards")
+ parser.add_option("--test-launcher-shard-index", type=int,
+ help="run the tests with --test-launcher-shard-index")
+ parser.add_option("--drmemory_ops",
+ help="extra options passed to Dr. Memory")
+
+ options, args = parser.parse_args()
+
+ # Bake target into build_dir.
+ if options.target and options.build_dir:
+ assert (options.target !=
+ os.path.basename(os.path.dirname(options.build_dir)))
+ options.build_dir = os.path.join(os.path.abspath(options.build_dir),
+ options.target)
+
+ if options.verbose:
+ logging_utils.config_root(logging.DEBUG)
+ else:
+ logging_utils.config_root()
+
+ if options.help_tests:
+ ChromeTests.ShowTests()
+ return 0
+
+ if not options.test:
+ parser.error("--test not specified")
+
+ if len(options.test) != 1 and options.gtest_filter:
+ parser.error("--gtest_filter and multiple tests don't make sense together")
+
+ BROKEN_TESTS = {
+ 'drmemory_light': [
+ 'addressinput',
+ 'aura',
+ 'base_unittests',
+ 'cc',
+ 'components', # x64 only?
+ 'content',
+ 'gfx',
+ 'mojo_public_bindings',
+ ],
+ 'drmemory_full': [
+ 'addressinput',
+ 'aura',
+ 'base_unittests',
+ 'blink_heap',
+ 'blink_platform',
+ 'browser_tests',
+ 'cast',
+ 'cc',
+ 'chromedriver',
+ 'compositor',
+ 'content',
+ 'content_browsertests',
+ 'device',
+ 'events',
+ 'extensions',
+ 'gfx',
+ 'google_apis',
+ 'gpu',
+ 'ipc_tests',
+ 'jingle',
+ 'keyboard',
+ 'media',
+ 'midi',
+ 'mojo_common',
+ 'mojo_public_bindings',
+ 'mojo_public_sysperf',
+ 'mojo_public_system',
+ 'mojo_system',
+ 'net',
+ 'remoting',
+ 'unit',
+ 'url',
+ ],
+ }
+
+ for t in options.test:
+ if t in BROKEN_TESTS[options.valgrind_tool] and not options.force:
+ logging.info("Skipping broken %s test %s -- see crbug.com/633693" %
+ (options.valgrind_tool, t))
+ return 0
+
+ tests = ChromeTests(options, args, t)
+ ret = tests.Run()
+ if ret: return ret
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/chrome_tests.sh b/files/tools_libyuv/valgrind/chrome_tests.sh
new file mode 100755
index 0000000..dc17684
--- /dev/null
+++ b/files/tools_libyuv/valgrind/chrome_tests.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Set up some paths and re-direct the arguments to chrome_tests.py
+
+export THISDIR=`dirname $0`
+ARGV_COPY="$@"
+
+# We need to set CHROME_VALGRIND iff using Memcheck:
+# tools/valgrind/chrome_tests.sh --tool memcheck
+# or
+# tools/valgrind/chrome_tests.sh --tool=memcheck
+tool="memcheck" # Default to memcheck.
+while (( "$#" ))
+do
+ if [[ "$1" == "--tool" ]]
+ then
+ tool="$2"
+ shift
+ elif [[ "$1" =~ --tool=(.*) ]]
+ then
+ tool="${BASH_REMATCH[1]}"
+ fi
+ shift
+done
+
+NEEDS_VALGRIND=0
+NEEDS_DRMEMORY=0
+
+case "$tool" in
+ "memcheck")
+ NEEDS_VALGRIND=1
+ ;;
+ "drmemory" | "drmemory_light" | "drmemory_full" | "drmemory_pattern")
+ NEEDS_DRMEMORY=1
+ ;;
+esac
+
+if [ "$NEEDS_VALGRIND" == "1" ]
+then
+ export CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
+ if [ "$CHROME_VALGRIND" = "" ]
+ then
+ # locate_valgrind.sh failed
+ exit 1
+ fi
+ echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+ PATH="${CHROME_VALGRIND}/bin:$PATH"
+  # We need to set these variables to override the default lib paths
+  # hard-coded into the Valgrind binary.
+ export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+ export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+ # Clean up some /tmp directories that might be stale due to interrupted
+ # chrome_tests.py execution.
+ # FYI:
+ # -mtime +1 <- only print files modified more than 24h ago,
+ # -print0/-0 are needed to handle possible newlines in the filenames.
+ echo "Cleanup /tmp from Valgrind stuff"
+ find /tmp -maxdepth 1 \(\
+ -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
+ \) -mtime +1 -print0 | xargs -0 rm -rf
+fi
+
+if [ "$NEEDS_DRMEMORY" == "1" ]
+then
+ if [ -z "$DRMEMORY_COMMAND" ]
+ then
+ DRMEMORY_PATH="$THISDIR/../../third_party/drmemory"
+ DRMEMORY_SFX="$DRMEMORY_PATH/drmemory-windows-sfx.exe"
+ if [ ! -f "$DRMEMORY_SFX" ]
+ then
+ echo "Can't find Dr. Memory executables."
+ echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+ echo "for the instructions on how to get them."
+ exit 1
+ fi
+
+ chmod +x "$DRMEMORY_SFX" # Cygwin won't run it without +x.
+ "$DRMEMORY_SFX" -o"$DRMEMORY_PATH/unpacked" -y
+ export DRMEMORY_COMMAND="$DRMEMORY_PATH/unpacked/bin/drmemory.exe"
+ fi
+fi
+
+PYTHONPATH=$THISDIR/../python/google python \
+ "$THISDIR/chrome_tests.py" $ARGV_COPY
diff --git a/files/tools_libyuv/valgrind/common.py b/files/tools_libyuv/valgrind/common.py
new file mode 100644
index 0000000..e9ee51e
--- /dev/null
+++ b/files/tools_libyuv/valgrind/common.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+import logging
+import platform
+import os
+import signal
+import subprocess
+import sys
+import time
+
+
+class NotImplementedError(Exception):
+ pass
+
+
+class TimeoutError(Exception):
+ pass
+
+
+def RunSubprocessInBackground(proc):
+ """Runs a subprocess in the background. Returns a handle to the process."""
+ logging.info("running %s in the background" % " ".join(proc))
+ return subprocess.Popen(proc)
+
+
+def RunSubprocess(proc, timeout=0):
+ """ Runs a subprocess, until it finishes or |timeout| is exceeded and the
+ process is killed with taskkill. A |timeout| <= 0 means no timeout.
+
+ Args:
+ proc: list of process components (exe + args)
+ timeout: how long to wait before killing, <= 0 means wait forever
+ """
+
+ logging.info("running %s, timeout %d sec" % (" ".join(proc), timeout))
+ sys.stdout.flush()
+ sys.stderr.flush()
+
+ # Manually read and print out stdout and stderr.
+ # By default, the subprocess is supposed to inherit these from its parent,
+ # however when run under buildbot, it seems unable to read data from a
+ # grandchild process, so we have to read the child and print the data as if
+ # it came from us for buildbot to read it. We're not sure why this is
+ # necessary.
+ # TODO(erikkay): should we buffer stderr and stdout separately?
+ p = subprocess.Popen(proc, universal_newlines=True,
+ bufsize=0, # unbuffered
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+ logging.info("started subprocess")
+
+ did_timeout = False
+ if timeout > 0:
+ wait_until = time.time() + timeout
+ while p.poll() is None and not did_timeout:
+ # Have to use readline rather than readlines() or "for line in p.stdout:",
+ # otherwise we get buffered even with bufsize=0.
+ line = p.stdout.readline()
+ while line and not did_timeout:
+ sys.stdout.write(line)
+ sys.stdout.flush()
+ line = p.stdout.readline()
+ if timeout > 0:
+ did_timeout = time.time() > wait_until
+
+ if did_timeout:
+ logging.info("process timed out")
+ else:
+ logging.info("process ended, did not time out")
+
+ if did_timeout:
+ if IsWindows():
+ subprocess.call(["taskkill", "/T", "/F", "/PID", str(p.pid)])
+ else:
+ # Does this kill all children, too?
+ os.kill(p.pid, signal.SIGINT)
+ logging.error("KILLED %d" % p.pid)
+ # Give the process a chance to actually die before continuing
+ # so that cleanup can happen safely.
+ time.sleep(1.0)
+ logging.error("TIMEOUT waiting for %s" % proc[0])
+ raise TimeoutError(proc[0])
+ else:
+ for line in p.stdout:
+ sys.stdout.write(line)
+ if not IsMac(): # stdout flush fails on Mac
+ logging.info("flushing stdout")
+ sys.stdout.flush()
+
+ logging.info("collecting result code")
+ result = p.poll()
+ if result:
+ logging.error("%s exited with non-zero result code %d" % (proc[0], result))
+ return result
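+# Minimal usage sketch (the binary path below is illustrative, not part of
+# this change):
+#   ret = RunSubprocess(["out/Release/libyuv_unittest"], timeout=600)
+#   sys.exit(ret)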
+
+
+def IsLinux():
+ return sys.platform.startswith('linux')
+
+
+def IsMac():
+ return sys.platform.startswith('darwin')
+
+
+def IsWindows():
+ return sys.platform == 'cygwin' or sys.platform.startswith('win')
+
+
+def WindowsVersionName():
+ """Returns the name of the Windows version if it is known, or None.
+
+ Possible return values are: xp, vista, 7, 8, or None
+ """
+ if sys.platform == 'cygwin':
+ # Windows version number is hiding in system name. Looks like:
+ # CYGWIN_NT-6.1-WOW64
+ try:
+ version_str = platform.uname()[0].split('-')[1]
+ except:
+ return None
+ elif sys.platform.startswith('win'):
+ # Normal Windows version string. Mine: 6.1.7601
+ version_str = platform.version()
+ else:
+ return None
+
+ parts = version_str.split('.')
+ try:
+ major = int(parts[0])
+ minor = int(parts[1])
+ except:
+ return None # Can't parse, unknown version.
+
+ if major == 5:
+ return 'xp'
+ elif major == 6 and minor == 0:
+ return 'vista'
+ elif major == 6 and minor == 1:
+ return '7'
+ elif major == 6 and minor == 2:
+ return '8' # Future proof. ;)
+ return None
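+# e.g. a cygwin uname of "CYGWIN_NT-6.1-WOW64" and a native version string of
+# "6.1.7601" both parse to major 6 / minor 1 and map to '7'.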
+
+
+def PlatformNames():
+ """Return an array of string to be used in paths for the platform
+ (e.g. suppressions, gtest filters, ignore files etc.)
+ The first element of the array describes the 'main' platform
+ """
+ if IsLinux():
+ return ['linux']
+ if IsMac():
+ return ['mac']
+ if IsWindows():
+ names = ['win32']
+ version_name = WindowsVersionName()
+ if version_name is not None:
+ names.append('win-%s' % version_name)
+ return names
+ raise NotImplementedError('Unknown platform "%s".' % sys.platform)
+
+
+def PutEnvAndLog(env_name, env_value):
+ os.putenv(env_name, env_value)
+ logging.info('export %s=%s', env_name, env_value)
+
+def BoringCallers(mangled, use_re_wildcards):
+ """Return a list of 'boring' function names (optinally mangled)
+ with */? wildcards (optionally .*/.).
+ Boring = we drop off the bottom of stack traces below such functions.
+ """
+
+ need_mangling = [
+ # Don't show our testing framework:
+ ("testing::Test::Run", "_ZN7testing4Test3RunEv"),
+ ("testing::TestInfo::Run", "_ZN7testing8TestInfo3RunEv"),
+ ("testing::internal::Handle*ExceptionsInMethodIfSupported*",
+ "_ZN7testing8internal3?Handle*ExceptionsInMethodIfSupported*"),
+
+ # Depend on scheduling:
+ ("MessageLoop::Run", "_ZN11MessageLoop3RunEv"),
+ ("MessageLoop::RunTask", "_ZN11MessageLoop7RunTask*"),
+ ("RunnableMethod*", "_ZN14RunnableMethod*"),
+ ("DispatchToMethod*", "_Z*16DispatchToMethod*"),
+ ("base::internal::Invoker*::DoInvoke*",
+ "_ZN4base8internal8Invoker*DoInvoke*"), # Invoker{1,2,3}
+ ("base::internal::RunnableAdapter*::Run*",
+ "_ZN4base8internal15RunnableAdapter*Run*"),
+ ]
+
+ ret = []
+ for pair in need_mangling:
+ ret.append(pair[1 if mangled else 0])
+
+ ret += [
+ # Also don't show the internals of libc/pthread.
+ "start_thread",
+ "main",
+ "BaseThreadInitThunk",
+ ]
+
+ if use_re_wildcards:
+ for i in range(0, len(ret)):
+ ret[i] = ret[i].replace('*', '.*').replace('?', '.')
+
+ return ret
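+# Typical use, mirroring memcheck_analyze.py: stop printing a stack once a
+# boring caller is hit, e.g.
+#   boring = BoringCallers(mangled=True, use_re_wildcards=True)
+#   if any(re.match("^%s$" % b, fn) for b in boring):
+#     ...  # truncate the trace here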
+
+def NormalizeWindowsPath(path):
+ """If we're using Cygwin Python, turn the path into a Windows path.
+
+ Don't turn forward slashes into backslashes for easier copy-pasting and
+ escaping.
+
+ TODO(rnk): If we ever want to cut out the subprocess invocation, we can use
+ _winreg to get the root Cygwin directory from the registry key:
+ HKEY_LOCAL_MACHINE\SOFTWARE\Cygwin\setup\rootdir.
+ """
+ if sys.platform.startswith("cygwin"):
+ p = subprocess.Popen(["cygpath", "-m", path],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ (out, err) = p.communicate()
+ if err:
+ logging.warning("WARNING: cygpath error: %s", err)
+ return out.strip()
+ else:
+ return path
+
+############################
+# Common output format code
+
+def PrintUsedSuppressionsList(suppcounts):
+ """ Prints out the list of used suppressions in a format common to all the
+ memory tools. If the list is empty, prints nothing and returns False,
+ otherwise True.
+
+ suppcounts: a dictionary of used suppression counts,
+ Key -> name, Value -> count.
+ """
+ if not suppcounts:
+ return False
+
+ print "-----------------------------------------------------"
+ print "Suppressions used:"
+ print " count name"
+ for (name, count) in sorted(suppcounts.items(), key=lambda (k,v): (v,k)):
+ print "%7d %s" % (count, name)
+ print "-----------------------------------------------------"
+ sys.stdout.flush()
+ return True
diff --git a/files/tools_libyuv/valgrind/gdb_helper.py b/files/tools_libyuv/valgrind/gdb_helper.py
new file mode 100644
index 0000000..d127f76
--- /dev/null
+++ b/files/tools_libyuv/valgrind/gdb_helper.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+''' A bunch of helper functions for querying gdb.'''
+
+import logging
+import os
+import re
+import tempfile
+
+GDB_LINE_RE = re.compile(r'Line ([0-9]*) of "([^"]*)".*')
+
+def _GdbOutputToFileLine(output_line):
+ ''' Parse the gdb output line, return a pair (file, line num) '''
+ match = GDB_LINE_RE.match(output_line)
+ if match:
+ return match.groups()[1], match.groups()[0]
+ else:
+ return None
+
+def ResolveAddressesWithinABinary(binary_name, load_address, address_list):
+ ''' For each address, return a pair (file, line num) '''
+ commands = tempfile.NamedTemporaryFile()
+ commands.write('add-symbol-file "%s" %s\n' % (binary_name, load_address))
+ for addr in address_list:
+ commands.write('info line *%s\n' % addr)
+ commands.write('quit\n')
+ commands.flush()
+ gdb_commandline = 'gdb -batch -x %s 2>/dev/null' % commands.name
+ gdb_pipe = os.popen(gdb_commandline)
+ result = gdb_pipe.readlines()
+
+ address_count = 0
+ ret = {}
+ for line in result:
+ if line.startswith('Line'):
+ ret[address_list[address_count]] = _GdbOutputToFileLine(line)
+ address_count += 1
+ if line.startswith('No line'):
+ ret[address_list[address_count]] = (None, None)
+ address_count += 1
+ gdb_pipe.close()
+ commands.close()
+ return ret
+
+class AddressTable(object):
+ ''' Object to do batched line number lookup. '''
+ def __init__(self):
+ self._load_addresses = {}
+ self._binaries = {}
+ self._all_resolved = False
+
+ def AddBinaryAt(self, binary, load_address):
+ ''' Register a new shared library or executable. '''
+ self._load_addresses[binary] = load_address
+
+ def Add(self, binary, address):
+ ''' Register a lookup request. '''
+ if binary == '':
+ logging.warn('adding address %s in empty binary?' % address)
+ if binary in self._binaries:
+ self._binaries[binary].append(address)
+ else:
+ self._binaries[binary] = [address]
+ self._all_resolved = False
+
+ def ResolveAll(self):
+ ''' Carry out all lookup requests. '''
+ self._translation = {}
+ for binary in self._binaries.keys():
+ if binary != '' and binary in self._load_addresses:
+ load_address = self._load_addresses[binary]
+ addr = ResolveAddressesWithinABinary(
+ binary, load_address, self._binaries[binary])
+ self._translation[binary] = addr
+ self._all_resolved = True
+
+ def GetFileLine(self, binary, addr):
+ ''' Get the (filename, linenum) result of a previously-registered lookup
+ request.
+ '''
+ if self._all_resolved:
+ if binary in self._translation:
+ if addr in self._translation[binary]:
+ return self._translation[binary][addr]
+ return (None, None)
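+# Typical flow, as driven by memcheck_analyze.py (the paths and addresses
+# below are illustrative):
+#   table = AddressTable()
+#   table.AddBinaryAt("/usr/lib/libfoo.so", "0x27000")
+#   table.Add("/usr/lib/libfoo.so", "0x27abc")
+#   table.ResolveAll()
+#   file_name, line_num = table.GetFileLine("/usr/lib/libfoo.so", "0x27abc")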
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.bat b/files/tools_libyuv/valgrind/libyuv_tests.bat
index e37f09e..5fceca6 100644
--- a/files/tools_libyuv/valgrind/libyuv_tests.bat
+++ b/files/tools_libyuv/valgrind/libyuv_tests.bat
@@ -1,79 +1,79 @@
-@echo off
-:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
-::
-:: Use of this source code is governed by a BSD-style license
-:: that can be found in the LICENSE file in the root of the source
-:: tree. An additional intellectual property rights grant can be found
-:: in the file PATENTS. All contributing project authors may
-:: be found in the AUTHORS file in the root of the source tree.
-
-:: This script is a copy of chrome_tests.bat with the following changes:
-:: - Invokes libyuv_tests.py instead of chrome_tests.py
-:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
-:: it possible to execute the Python scripts properly.
-
-:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
-set THISDIR=%~dp0
-set TOOL_NAME="unknown"
-
-:: Get the tool name and put it into TOOL_NAME {{{1
-:: NB: SHIFT command doesn't modify %*
-:PARSE_ARGS_LOOP
- if %1 == () GOTO:TOOLNAME_NOT_FOUND
- if %1 == --tool GOTO:TOOLNAME_FOUND
- SHIFT
- goto :PARSE_ARGS_LOOP
-
-:TOOLNAME_NOT_FOUND
-echo "Please specify a tool (tsan or drmemory) by using --tool flag"
-exit /B 1
-
-:TOOLNAME_FOUND
-SHIFT
-set TOOL_NAME=%1
-:: }}}
-if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
-if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
-echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
-exit /B 1
-
-:SETUP_DRMEMORY
-if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
-:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
-set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
-set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
-if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
-echo "Can't find Dr. Memory executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:DRMEMORY_BINARY_OK
-%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
-set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
-:: }}}
-goto :RUN_TESTS
-
-:SETUP_TSAN
-:: Set up PIN_COMMAND to invoke TSan {{{1
-set TSAN_PATH=%THISDIR%..\..\third_party\tsan
-set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
-if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
-echo "Can't find ThreadSanitizer executables."
-echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
-echo "for the instructions on how to get them."
-exit /B 1
-
-:TSAN_BINARY_OK
-%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
-set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
-:: }}}
-goto :RUN_TESTS
-
-:RUN_TESTS
-set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
-set RUNNING_ON_VALGRIND=yes
-python %THISDIR%libyuv_tests.py %*
+@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS. All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+:: it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if %1 == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%libyuv_tests.py %*
diff --git a/files/tools_libyuv/valgrind/libyuv_tests.sh b/files/tools_libyuv/valgrind/libyuv_tests.sh
index 975b5e3..249032c 100755
--- a/files/tools_libyuv/valgrind/libyuv_tests.sh
+++ b/files/tools_libyuv/valgrind/libyuv_tests.sh
@@ -54,7 +54,7 @@
if [ "$NEEDS_VALGRIND" == "1" ]
then
- CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh`
+ CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
if [ "$CHROME_VALGRIND" = "" ]
then
CHROME_VALGRIND=../../src/third_party/valgrind/linux_x64
diff --git a/files/tools_libyuv/valgrind/locate_valgrind.sh b/files/tools_libyuv/valgrind/locate_valgrind.sh
new file mode 100755
index 0000000..d9594f4
--- /dev/null
+++ b/files/tools_libyuv/valgrind/locate_valgrind.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Prints a path to Valgrind binaries to be used for Chromium.
+# Selects the valgrind from third_party/valgrind by default, but allows users
+# to override this default without editing scripts and without specifying a
+# commandline option.
+
+export THISDIR=`dirname $0`
+
+# User may use their own valgrind by giving its path with CHROME_VALGRIND env.
+if [ "$CHROME_VALGRIND" = "" ]
+then
+ # Guess which binaries we should use by uname
+ case "$(uname -a)" in
+ *Linux*x86_64*)
+ PLATFORM="linux_x64"
+ ;;
+ *Linux*86*)
+ PLATFORM="linux_x86"
+ ;;
+ *Darwin*9.[678].[01]*i386*)
+ # Didn't test other kernels.
+ PLATFORM="mac"
+ ;;
+ *Darwin*10.[0-9].[0-9]*i386*)
+ PLATFORM="mac_10.6"
+ ;;
+ *Darwin*10.[0-9].[0-9]*x86_64*)
+ PLATFORM="mac_10.6"
+ ;;
+ *Darwin*11.[0-9].[0-9]*x86_64*)
+ PLATFORM="mac_10.7"
+ ;;
+ *)
+ (echo "Sorry, your platform is not supported:" &&
+ uname -a
+ echo
+ echo "If you're on Mac OS X, please see http://crbug.com/441425") >&2
+ exit 42
+ esac
+
+ # The binaries should be in third_party/valgrind
+ # (checked out from deps/third_party/valgrind/binaries).
+ CHROME_VALGRIND="$THISDIR/../../third_party/valgrind/$PLATFORM"
+
+ # TODO(timurrrr): readlink -f is not present on Mac...
+ if [ "$PLATFORM" != "mac" ] && \
+ [ "$PLATFORM" != "mac_10.6" ] && \
+ [ "$PLATFORM" != "mac_10.7" ]
+ then
+ # Get rid of all "../" dirs
+ CHROME_VALGRIND=$(readlink -f $CHROME_VALGRIND)
+ fi
+fi
+
+if ! test -x $CHROME_VALGRIND/bin/valgrind
+then
+ echo "Oops, could not find Valgrind binaries in your checkout." >&2
+ echo "Please see" >&2
+ echo " http://dev.chromium.org/developers/how-tos/using-valgrind/get-valgrind" >&2
+ echo "for the instructions on how to download pre-built binaries." >&2
+ exit 1
+fi
+
+echo $CHROME_VALGRIND
diff --git a/files/tools_libyuv/valgrind/memcheck/suppressions.txt b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
index 3ad0c8c..3f0f6d4 100644
--- a/files/tools_libyuv/valgrind/memcheck/suppressions.txt
+++ b/files/tools_libyuv/valgrind/memcheck/suppressions.txt
@@ -2,4 +2,20 @@
# It acts as a place holder for future additions for this project.
# It must exist for the Python wrapper script to work properly.
+# There are three kinds of suppressions in this file:
+# 1. third_party libraries
+# 2. libyuv stuff
+# 3. libjingle stuff (talk folder)
+#-----------------------------------------------------------------------
+# third_party libraries
+{
+ bug_729
+ Memcheck:Free
+ fun:_ZdaPv
+ ...
+ fun:_ZN7testing8internal12UnitTestImplD1Ev
+ ...
+}
+
+# libyuv (empty so far)
diff --git a/files/tools_libyuv/valgrind/memcheck_analyze.py b/files/tools_libyuv/valgrind/memcheck_analyze.py
new file mode 100755
index 0000000..80e85eb
--- /dev/null
+++ b/files/tools_libyuv/valgrind/memcheck_analyze.py
@@ -0,0 +1,644 @@
+#!/usr/bin/env python
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# memcheck_analyze.py
+
+''' Given a valgrind XML file, parses errors and uniques them.'''
+
+import gdb_helper
+
+from collections import defaultdict
+import hashlib
+import logging
+import optparse
+import os
+import re
+import subprocess
+import sys
+import time
+from xml.dom.minidom import parse
+from xml.parsers.expat import ExpatError
+
+import common
+
+# Global symbol table (yuck)
+TheAddressTable = None
+
+# These are regexps that define functions (using C++ mangled names)
+# we don't want to see in stack traces while pretty printing
+# or generating suppressions.
+# Just stop printing the stack/suppression frames when the current one
+# matches any of these.
+_BORING_CALLERS = common.BoringCallers(mangled=True, use_re_wildcards=True)
+
+def getTextOf(top_node, name):
+ ''' Returns all text in all DOM nodes with a certain |name| that are children
+ of |top_node|.
+ '''
+
+ text = ""
+ for nodes_named in top_node.getElementsByTagName(name):
+ text += "".join([node.data for node in nodes_named.childNodes
+ if node.nodeType == node.TEXT_NODE])
+ return text
+
+def getCDATAOf(top_node, name):
+ ''' Returns all CDATA in all DOM nodes with a certain |name| that are children
+ of |top_node|.
+ '''
+
+ text = ""
+ for nodes_named in top_node.getElementsByTagName(name):
+ text += "".join([node.data for node in nodes_named.childNodes
+ if node.nodeType == node.CDATA_SECTION_NODE])
+ if (text == ""):
+ return None
+ return text
+
+def shortenFilePath(source_dir, directory):
+ '''Returns a string with the string prefix |source_dir| removed from
+ |directory|.'''
+ prefixes_to_cut = ["build/src/", "valgrind/coregrind/", "out/Release/../../"]
+
+ if source_dir:
+ prefixes_to_cut.append(source_dir)
+
+ for p in prefixes_to_cut:
+ index = directory.rfind(p)
+ if index != -1:
+ directory = directory[index + len(p):]
+
+ return directory
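+# e.g. with the prefixes above,
+#   shortenFilePath(None, "/b/build/src/out/Release/../../media/base")
+# returns "media/base".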
+
+# Constants that give real names to the abbreviations in valgrind XML output.
+INSTRUCTION_POINTER = "ip"
+OBJECT_FILE = "obj"
+FUNCTION_NAME = "fn"
+SRC_FILE_DIR = "dir"
+SRC_FILE_NAME = "file"
+SRC_LINE = "line"
+
+def gatherFrames(node, source_dir):
+ frames = []
+ for frame in node.getElementsByTagName("frame"):
+ frame_dict = {
+ INSTRUCTION_POINTER : getTextOf(frame, INSTRUCTION_POINTER),
+ OBJECT_FILE : getTextOf(frame, OBJECT_FILE),
+ FUNCTION_NAME : getTextOf(frame, FUNCTION_NAME),
+ SRC_FILE_DIR : shortenFilePath(
+ source_dir, getTextOf(frame, SRC_FILE_DIR)),
+ SRC_FILE_NAME : getTextOf(frame, SRC_FILE_NAME),
+ SRC_LINE : getTextOf(frame, SRC_LINE)
+ }
+
+ # Ignore this frame and all the following if it's a "boring" function.
+ enough_frames = False
+ for regexp in _BORING_CALLERS:
+ if re.match("^%s$" % regexp, frame_dict[FUNCTION_NAME]):
+ enough_frames = True
+ break
+ if enough_frames:
+ break
+
+ frames += [frame_dict]
+
+ global TheAddressTable
+ if TheAddressTable != None and frame_dict[SRC_LINE] == "":
+ # Try using gdb
+ TheAddressTable.Add(frame_dict[OBJECT_FILE],
+ frame_dict[INSTRUCTION_POINTER])
+ return frames
+
+class ValgrindError:
+ ''' Takes a <DOM Element: error> node and reads all the data from it. A
+ ValgrindError is immutable and is hashed on its pretty printed output.
+ '''
+
+ def __init__(self, source_dir, error_node, commandline, testcase):
+ ''' Copies all the relevant information out of the DOM and into object
+ properties.
+
+ Args:
+ error_node: The <error></error> DOM node we're extracting from.
+ source_dir: Prefix that should be stripped from the <dir> node.
+ commandline: The command that was run under valgrind
+ testcase: The test case name, if known.
+ '''
+
+ # Valgrind errors contain one <what><stack> pair, plus an optional
+ # <auxwhat><stack> pair, plus an optional <origin><what><stack></origin>,
+ # plus (since 3.5.0) a <suppression></suppression> pair.
+ # (Origin is nicely enclosed; too bad the other two aren't.)
+ # The most common way to see all three in one report is
+ # a syscall with a parameter that points to uninitialized memory, e.g.
+ # Format:
+ # <error>
+ # <unique>0x6d</unique>
+ # <tid>1</tid>
+ # <kind>SyscallParam</kind>
+ # <what>Syscall param write(buf) points to uninitialised byte(s)</what>
+ # <stack>
+ # <frame>
+ # ...
+ # </frame>
+ # </stack>
+ # <auxwhat>Address 0x5c9af4f is 7 bytes inside a block of ...</auxwhat>
+ # <stack>
+ # <frame>
+ # ...
+ # </frame>
+ # </stack>
+ # <origin>
+ # <what>Uninitialised value was created by a heap allocation</what>
+ # <stack>
+ # <frame>
+ # ...
+ # </frame>
+ # </stack>
+ # </origin>
+ # <suppression>
+ # <sname>insert_a_suppression_name_here</sname>
+ # <skind>Memcheck:Param</skind>
+ # <skaux>write(buf)</skaux>
+ # <sframe> <fun>__write_nocancel</fun> </sframe>
+ # ...
+ # <sframe> <fun>main</fun> </sframe>
+ # <rawtext>
+ # <![CDATA[
+ # {
+ # <insert_a_suppression_name_here>
+ # Memcheck:Param
+ # write(buf)
+ # fun:__write_nocancel
+ # ...
+ # fun:main
+ # }
+ # ]]>
+ # </rawtext>
+ # </suppression>
+ # </error>
+ #
+ # Each frame looks like this:
+ # <frame>
+ # <ip>0x83751BC</ip>
+ # <obj>/data/dkegel/chrome-build/src/out/Release/base_unittests</obj>
+ # <fn>_ZN7testing8internal12TestInfoImpl7RunTestEPNS_8TestInfoE</fn>
+ # <dir>/data/dkegel/chrome-build/src/testing/gtest/src</dir>
+ # <file>gtest-internal-inl.h</file>
+ # <line>655</line>
+ # </frame>
+ # although the dir, file, and line elements are missing if there is
+ # no debug info.
+
+ self._kind = getTextOf(error_node, "kind")
+ self._backtraces = []
+ self._suppression = None
+ self._commandline = commandline
+ self._testcase = testcase
+ self._additional = []
+
+ # Iterate through the nodes, parsing <what|auxwhat><stack> pairs.
+ description = None
+ for node in error_node.childNodes:
+ if node.localName == "what" or node.localName == "auxwhat":
+ description = "".join([n.data for n in node.childNodes
+ if n.nodeType == n.TEXT_NODE])
+ elif node.localName == "xwhat":
+ description = getTextOf(node, "text")
+ elif node.localName == "stack":
+ assert description
+ self._backtraces.append([description, gatherFrames(node, source_dir)])
+ description = None
+ elif node.localName == "origin":
+ description = getTextOf(node, "what")
+ stack = node.getElementsByTagName("stack")[0]
+ frames = gatherFrames(stack, source_dir)
+ self._backtraces.append([description, frames])
+ description = None
+ stack = None
+ frames = None
+ elif description and node.localName != None:
+        # The latest description has no stack, e.g. "Address 0x28 is unknown"
+ self._additional.append(description)
+ description = None
+
+ if node.localName == "suppression":
+ self._suppression = getCDATAOf(node, "rawtext");
+
+ def __str__(self):
+ ''' Pretty print the type and backtrace(s) of this specific error,
+ including suppression (which is just a mangled backtrace).'''
+ output = ""
+ output += "\n" # Make sure the ### is at the beginning of line.
+ output += "### BEGIN MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
+ self.ErrorHash()
+ if (self._commandline):
+ output += self._commandline + "\n"
+
+ output += self._kind + "\n"
+ for backtrace in self._backtraces:
+ output += backtrace[0] + "\n"
+ filter = subprocess.Popen("c++filt -n", stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ shell=True,
+ close_fds=True)
+ buf = ""
+ for frame in backtrace[1]:
+ buf += (frame[FUNCTION_NAME] or frame[INSTRUCTION_POINTER]) + "\n"
+ (stdoutbuf, stderrbuf) = filter.communicate(buf.encode('latin-1'))
+ demangled_names = stdoutbuf.split("\n")
+
+ i = 0
+ for frame in backtrace[1]:
+ output += (" " + demangled_names[i])
+ i = i + 1
+
+ global TheAddressTable
+ if TheAddressTable != None and frame[SRC_FILE_DIR] == "":
+ # Try using gdb
+ foo = TheAddressTable.GetFileLine(frame[OBJECT_FILE],
+ frame[INSTRUCTION_POINTER])
+ if foo[0] != None:
+ output += (" (" + foo[0] + ":" + foo[1] + ")")
+ elif frame[SRC_FILE_DIR] != "":
+ output += (" (" + frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME] +
+ ":" + frame[SRC_LINE] + ")")
+ else:
+ output += " (" + frame[OBJECT_FILE] + ")"
+ output += "\n"
+
+ for additional in self._additional:
+ output += additional + "\n"
+
+ assert self._suppression != None, "Your Valgrind doesn't generate " \
+ "suppressions - is it too old?"
+
+ if self._testcase:
+ output += "The report came from the `%s` test.\n" % self._testcase
+ output += "Suppression (error hash=#%016X#):\n" % self.ErrorHash()
+ output += (" For more info on using suppressions see "
+ "http://dev.chromium.org/developers/tree-sheriffs/sheriff-details-chromium/memory-sheriff#TOC-Suppressing-memory-reports")
+
+ # Widen suppression slightly to make portable between mac and linux
+ # TODO(timurrrr): Oops, these transformations should happen
+ # BEFORE calculating the hash!
+ supp = self._suppression;
+ supp = supp.replace("fun:_Znwj", "fun:_Znw*")
+ supp = supp.replace("fun:_Znwm", "fun:_Znw*")
+ supp = supp.replace("fun:_Znaj", "fun:_Zna*")
+ supp = supp.replace("fun:_Znam", "fun:_Zna*")
+
+ # Make suppressions even less platform-dependent.
+ for sz in [1, 2, 4, 8]:
+ supp = supp.replace("Memcheck:Addr%d" % sz, "Memcheck:Unaddressable")
+ supp = supp.replace("Memcheck:Value%d" % sz, "Memcheck:Uninitialized")
+ supp = supp.replace("Memcheck:Cond", "Memcheck:Uninitialized")
+
+ # Split into lines so we can enforce length limits
+ supplines = supp.split("\n")
+ supp = None # to avoid re-use
+
+ # Truncate at line 26 (VG_MAX_SUPP_CALLERS plus 2 for name and type)
+ # or at the first 'boring' caller.
+ # (https://bugs.kde.org/show_bug.cgi?id=199468 proposes raising
+ # VG_MAX_SUPP_CALLERS, but we're probably fine with it as is.)
+ newlen = min(26, len(supplines));
+
+ # Drop boring frames and all the following.
+ enough_frames = False
+ for frameno in range(newlen):
+ for boring_caller in _BORING_CALLERS:
+ if re.match("^ +fun:%s$" % boring_caller, supplines[frameno]):
+ newlen = frameno
+ enough_frames = True
+ break
+ if enough_frames:
+ break
+ if (len(supplines) > newlen):
+ supplines = supplines[0:newlen]
+ supplines.append("}")
+
+ for frame in range(len(supplines)):
+ # Replace the always-changing anonymous namespace prefix with "*".
+ m = re.match("( +fun:)_ZN.*_GLOBAL__N_.*\.cc_" +
+ "[0-9a-fA-F]{8}_[0-9a-fA-F]{8}(.*)",
+ supplines[frame])
+ if m:
+ supplines[frame] = "*".join(m.groups())
+
+ output += "\n".join(supplines) + "\n"
+ output += "### END MEMORY TOOL REPORT (error hash=#%016X#)\n" % \
+ self.ErrorHash()
+
+ return output
+
+ def UniqueString(self):
+ ''' String to use for object identity. Don't print this, use str(obj)
+ instead.'''
+ rep = self._kind + " "
+ for backtrace in self._backtraces:
+ for frame in backtrace[1]:
+ rep += frame[FUNCTION_NAME]
+
+ if frame[SRC_FILE_DIR] != "":
+ rep += frame[SRC_FILE_DIR] + "/" + frame[SRC_FILE_NAME]
+ else:
+ rep += frame[OBJECT_FILE]
+
+ return rep
+
+ # This is a device-independent hash identifying the suppression.
+ # By printing out this hash we can find duplicate reports between tests and
+ # different shards running on multiple buildbots
+ def ErrorHash(self):
+ return int(hashlib.md5(self.UniqueString()).hexdigest()[:16], 16)
+
+ def __hash__(self):
+ return hash(self.UniqueString())
+ def __eq__(self, rhs):
+ return self.UniqueString() == rhs
+
+def log_is_finished(f, force_finish):
+ f.seek(0)
+ prev_line = ""
+ while True:
+ line = f.readline()
+ if line == "":
+ if not force_finish:
+ return False
+ # Okay, the log is not finished but we can make it up to be parseable:
+ if prev_line.strip() in ["</error>", "</errorcounts>", "</status>"]:
+ f.write("</valgrindoutput>\n")
+ return True
+ return False
+ if '</valgrindoutput>' in line:
+ # Valgrind often has garbage after </valgrindoutput> upon crash.
+ f.truncate()
+ return True
+ prev_line = line
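+# If valgrind was killed and the log ends right after "</error>",
+# "</errorcounts>" or "</status>", the forced pass appends
+# "</valgrindoutput>\n" so the XML parser can still load the file.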
+
+class MemcheckAnalyzer:
+ ''' Given a set of Valgrind XML files, parse all the errors out of them,
+ unique them and output the results.'''
+
+ SANITY_TEST_SUPPRESSIONS = {
+ "Memcheck sanity test 01 (memory leak).": 1,
+ "Memcheck sanity test 02 (malloc/read left).": 1,
+ "Memcheck sanity test 03 (malloc/read right).": 1,
+ "Memcheck sanity test 04 (malloc/write left).": 1,
+ "Memcheck sanity test 05 (malloc/write right).": 1,
+ "Memcheck sanity test 06 (new/read left).": 1,
+ "Memcheck sanity test 07 (new/read right).": 1,
+ "Memcheck sanity test 08 (new/write left).": 1,
+ "Memcheck sanity test 09 (new/write right).": 1,
+ "Memcheck sanity test 10 (write after free).": 1,
+ "Memcheck sanity test 11 (write after delete).": 1,
+ "Memcheck sanity test 12 (array deleted without []).": 1,
+ "Memcheck sanity test 13 (single element deleted with []).": 1,
+ "Memcheck sanity test 14 (malloc/read uninit).": 1,
+ "Memcheck sanity test 15 (new/read uninit).": 1,
+ }
+
+ # Max time to wait for memcheck logs to complete.
+ LOG_COMPLETION_TIMEOUT = 180.0
+
+ def __init__(self, source_dir, show_all_leaks=False, use_gdb=False):
+ '''Create a parser for Memcheck logs.
+
+ Args:
+ source_dir: Path to top of source tree for this build
+ show_all_leaks: Whether to show even less important leaks
+ use_gdb: Whether to use gdb to resolve source filenames and line numbers
+ in the report stacktraces
+ '''
+ self._source_dir = source_dir
+ self._show_all_leaks = show_all_leaks
+ self._use_gdb = use_gdb
+
+ # Contains the set of unique errors
+ self._errors = set()
+
+    # Contains the time when we started analyzing the first log file.
+ # This variable is used to skip incomplete logs after some timeout.
+ self._analyze_start_time = None
+
+
+ def Report(self, files, testcase, check_sanity=False):
+ '''Reads in a set of files and prints Memcheck report.
+
+ Args:
+      files: A list of filenames.
+      testcase: The test case name, if known.
+      check_sanity: if true, search for SANITY_TEST_SUPPRESSIONS
+ '''
+ # Beyond the detailed errors parsed by ValgrindError above,
+    # the xml file contains records describing suppressions that were used:
+ # <suppcounts>
+ # <pair>
+ # <count>28</count>
+ # <name>pango_font_leak_todo</name>
+ # </pair>
+ # <pair>
+ # <count>378</count>
+ # <name>bug_13243</name>
+ # </pair>
+    # </suppcounts>
+ # Collect these and print them at the end.
+ #
+ # With our patch for https://bugs.kde.org/show_bug.cgi?id=205000 in,
+ # the file also includes records of the form
+ # <load_obj><obj>/usr/lib/libgcc_s.1.dylib</obj><ip>0x27000</ip></load_obj>
+ # giving the filename and load address of each binary that was mapped
+ # into the process.
+
+ global TheAddressTable
+ if self._use_gdb:
+ TheAddressTable = gdb_helper.AddressTable()
+ else:
+ TheAddressTable = None
+ cur_report_errors = set()
+ suppcounts = defaultdict(int)
+ badfiles = set()
+
+ if self._analyze_start_time == None:
+ self._analyze_start_time = time.time()
+ start_time = self._analyze_start_time
+
+ parse_failed = False
+ for file in files:
+ # Wait up to three minutes for valgrind to finish writing all files,
+ # but after that, just skip incomplete files and warn.
+ f = open(file, "r+")
+ pid = re.match(".*\.([0-9]+)$", file)
+ if pid:
+ pid = pid.groups()[0]
+ found = False
+ running = True
+ firstrun = True
+ skip = False
+ origsize = os.path.getsize(file)
+ while (running and not found and not skip and
+ (firstrun or
+ ((time.time() - start_time) < self.LOG_COMPLETION_TIMEOUT))):
+ firstrun = False
+ f.seek(0)
+ if pid:
+ # Make sure the process is still running so we don't wait for
+ # 3 minutes if it was killed. See http://crbug.com/17453
+ ps_out = subprocess.Popen("ps p %s" % pid, shell=True,
+ stdout=subprocess.PIPE).stdout
+ if len(ps_out.readlines()) < 2:
+ running = False
+ else:
+ skip = True
+ running = False
+ found = log_is_finished(f, False)
+ if not running and not found:
+ logging.warn("Valgrind process PID = %s is not running but its "
+ "XML log has not been finished correctly.\n"
+ "Make it up by adding some closing tags manually." % pid)
+ found = log_is_finished(f, not running)
+ if running and not found:
+ time.sleep(1)
+ f.close()
+ if not found:
+ badfiles.add(file)
+ else:
+ newsize = os.path.getsize(file)
+ if origsize > newsize+1:
+ logging.warn(str(origsize - newsize) +
+ " bytes of junk were after </valgrindoutput> in %s!" %
+ file)
+ try:
+ parsed_file = parse(file);
+ except ExpatError, e:
+ parse_failed = True
+ logging.warn("could not parse %s: %s" % (file, e))
+ lineno = e.lineno - 1
+ context_lines = 5
+ context_start = max(0, lineno - context_lines)
+ context_end = lineno + context_lines + 1
+ context_file = open(file, "r")
+ for i in range(0, context_start):
+ context_file.readline()
+ for i in range(context_start, context_end):
+ context_data = context_file.readline().rstrip()
+ if i != lineno:
+ logging.warn(" %s" % context_data)
+ else:
+ logging.warn("> %s" % context_data)
+ context_file.close()
+ continue
+ if TheAddressTable != None:
+ load_objs = parsed_file.getElementsByTagName("load_obj")
+ for load_obj in load_objs:
+ obj = getTextOf(load_obj, "obj")
+ ip = getTextOf(load_obj, "ip")
+ TheAddressTable.AddBinaryAt(obj, ip)
+
+ commandline = None
+ preamble = parsed_file.getElementsByTagName("preamble")[0];
+ for node in preamble.getElementsByTagName("line"):
+ if node.localName == "line":
+ for x in node.childNodes:
+ if x.nodeType == node.TEXT_NODE and "Command" in x.data:
+ commandline = x.data
+ break
+
+ raw_errors = parsed_file.getElementsByTagName("error")
+ for raw_error in raw_errors:
+ # Ignore "possible" leaks for now by default.
+ if (self._show_all_leaks or
+ getTextOf(raw_error, "kind") != "Leak_PossiblyLost"):
+ error = ValgrindError(self._source_dir,
+ raw_error, commandline, testcase)
+ if error not in cur_report_errors:
+ # We haven't seen such errors doing this report yet...
+ if error in self._errors:
+ # ... but we saw it in earlier reports, e.g. previous UI test
+ cur_report_errors.add("This error was already printed in "
+ "some other test, see 'hash=#%016X#'" % \
+ error.ErrorHash())
+ else:
+ # ... and we haven't seen it in other tests as well
+ self._errors.add(error)
+ cur_report_errors.add(error)
+
+ suppcountlist = parsed_file.getElementsByTagName("suppcounts")
+ if len(suppcountlist) > 0:
+ suppcountlist = suppcountlist[0]
+ for node in suppcountlist.getElementsByTagName("pair"):
+ count = getTextOf(node, "count");
+ name = getTextOf(node, "name");
+ suppcounts[name] += int(count)
+
+ if len(badfiles) > 0:
+ logging.warn("valgrind didn't finish writing %d files?!" % len(badfiles))
+ for file in badfiles:
+ logging.warn("Last 20 lines of %s :" % file)
+ os.system("tail -n 20 '%s' 1>&2" % file)
+
+ if parse_failed:
+ logging.error("FAIL! Couldn't parse Valgrind output file")
+ return -2
+
+ common.PrintUsedSuppressionsList(suppcounts)
+
+ retcode = 0
+ if cur_report_errors:
+ logging.error("FAIL! There were %s errors: " % len(cur_report_errors))
+
+ if TheAddressTable != None:
+ TheAddressTable.ResolveAll()
+
+ for error in cur_report_errors:
+ logging.error(error)
+
+ retcode = -1
+
+ # Report tool's insanity even if there were errors.
+ if check_sanity:
+ remaining_sanity_supp = MemcheckAnalyzer.SANITY_TEST_SUPPRESSIONS
+ for (name, count) in suppcounts.iteritems():
+ # Workaround for http://crbug.com/334074
+ if (name in remaining_sanity_supp and
+ remaining_sanity_supp[name] <= count):
+ del remaining_sanity_supp[name]
+ if remaining_sanity_supp:
+ logging.error("FAIL! Sanity check failed!")
+ logging.info("The following test errors were not handled: ")
+ for (name, count) in remaining_sanity_supp.iteritems():
+ logging.info(" * %dx %s" % (count, name))
+ retcode = -3
+
+ if retcode != 0:
+ return retcode
+
+ logging.info("PASS! No errors found!")
+ return 0
+
+
+def _main():
+ '''For testing only. The MemcheckAnalyzer class should be imported instead.'''
+ parser = optparse.OptionParser("usage: %prog [options] <files to analyze>")
+ parser.add_option("", "--source-dir",
+                    help="path to top of source tree for this build "
+                    "(used to normalize source paths in baseline)")
+
+ (options, args) = parser.parse_args()
+ if len(args) == 0:
+ parser.error("no filename specified")
+ filenames = args
+
+ analyzer = MemcheckAnalyzer(options.source_dir, use_gdb=True)
+ return analyzer.Report(filenames, None)
+
+
+if __name__ == "__main__":
+ sys.exit(_main())
diff --git a/files/tools_libyuv/valgrind/valgrind.sh b/files/tools_libyuv/valgrind/valgrind.sh
new file mode 100755
index 0000000..7f3f792
--- /dev/null
+++ b/files/tools_libyuv/valgrind/valgrind.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# Copyright (c) 2017 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# This is a small script for manually launching valgrind, along with passing
+# it the suppression file, and some helpful arguments (automatically attaching
+# the debugger on failures, etc). Run it from your repo root, something like:
+# $ sh ./tools/valgrind/valgrind.sh ./out/Debug/chrome
+#
+# This is mostly intended for running the chrome browser interactively.
+# To run unit tests, you probably want to run chrome_tests.sh instead.
+# That's the script used by the valgrind buildbot.
+
+export THISDIR=`dirname $0`
+
+setup_memcheck() {
+ RUN_COMMAND="valgrind"
+
+ # Prompt to attach gdb when there was an error detected.
+ DEFAULT_TOOL_FLAGS=("--db-command=gdb -nw %f %p" "--db-attach=yes" \
+ # Keep the registers in gdb in sync with the code.
+ "--vex-iropt-register-updates=allregs-at-mem-access" \
+ # Overwrite newly allocated or freed objects
+                      # with 0x41 to catch improper use.
+ "--malloc-fill=41" "--free-fill=41" \
+ # Increase the size of stacks being tracked.
+ "--num-callers=30")
+}
+
+setup_unknown() {
+ echo "Unknown tool \"$TOOL_NAME\" specified, the result is not guaranteed"
+ DEFAULT_TOOL_FLAGS=()
+}
+
+set -e
+
+if [ $# -eq 0 ]; then
+ echo "usage: <command to run> <arguments ...>"
+ exit 1
+fi
+
+TOOL_NAME="memcheck"
+declare -a DEFAULT_TOOL_FLAGS[0]
+
+# Select a tool different from memcheck with --tool=TOOL as a first argument
+TMP_STR=`echo $1 | sed 's/^\-\-tool=//'`
+if [ "$TMP_STR" != "$1" ]; then
+ TOOL_NAME="$TMP_STR"
+ shift
+fi
+
+if echo "$@" | grep "\-\-tool" ; then
+ echo "--tool=TOOL must be the first argument" >&2
+ exit 1
+fi
+
+case $TOOL_NAME in
+ memcheck*) setup_memcheck "$1";;
+ *) setup_unknown;;
+esac
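+
+# Example invocations (the test binary path is hypothetical; any executable
+# works, and --tool must be the first argument, per the check above):
+#   ./tools_libyuv/valgrind/valgrind.sh ./out/Debug/libyuv_unittest
+#   ./tools_libyuv/valgrind/valgrind.sh --tool=memcheck ./out/Debug/libyuv_unittest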
+
+
+SUPPRESSIONS="$THISDIR/$TOOL_NAME/suppressions.txt"
+
+CHROME_VALGRIND=`sh $THISDIR/locate_valgrind.sh`
+if [ "$CHROME_VALGRIND" = "" ]
+then
+ # locate_valgrind.sh failed
+ exit 1
+fi
+echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+set -x
+PATH="${CHROME_VALGRIND}/bin:$PATH"
+# We need to set these variables to override default lib paths hard-coded into
+# Valgrind binary.
+export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+# G_SLICE=always-malloc: make glib use system malloc
+# NSS_DISABLE_UNLOAD=1: make nss skip dlclosing dynamically loaded modules,
+# which would result in "obj:*" in backtraces.
+# NSS_DISABLE_ARENA_FREE_LIST=1: make nss use system malloc
+# G_DEBUG=fatal_warnings: make GTK abort on any critical or warning assertions.
+# If it crashes on you in the Options menu, you hit bug 19751,
+# comment out the G_DEBUG=fatal_warnings line.
+#
+# GTEST_DEATH_TEST_USE_FORK=1: make gtest death tests valgrind-friendly
+#
+# When everyone has the latest valgrind, we might want to add
+# --show-possibly-lost=no
+# to ignore possible but not definite leaks.
+
+G_SLICE=always-malloc \
+NSS_DISABLE_UNLOAD=1 \
+NSS_DISABLE_ARENA_FREE_LIST=1 \
+G_DEBUG=fatal_warnings \
+GTEST_DEATH_TEST_USE_FORK=1 \
+$RUN_COMMAND \
+ --trace-children=yes \
+ --leak-check=yes \
+ --suppressions="$SUPPRESSIONS" \
+ "${DEFAULT_TOOL_FLAGS[@]}" \
+ "$@"
diff --git a/files/tools_libyuv/valgrind/valgrind_test.py b/files/tools_libyuv/valgrind/valgrind_test.py
new file mode 100755
index 0000000..0fd3d97
--- /dev/null
+++ b/files/tools_libyuv/valgrind/valgrind_test.py
@@ -0,0 +1,517 @@
+#!/usr/bin/env python
+# Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Runs an exe through Valgrind and puts the intermediate files in a
+directory.
+"""
+
+import datetime
+import glob
+import logging
+import optparse
+import os
+import re
+import shutil
+import stat
+import subprocess
+import sys
+import tempfile
+
+import common
+
+import memcheck_analyze
+
+class BaseTool(object):
+ """Abstract class for running dynamic error detection tools.
+
+ Always subclass this and implement ToolCommand with framework- and
+ tool-specific stuff.
+ """
+
+ def __init__(self):
+ temp_parent_dir = None
+ self.log_parent_dir = ""
+ if common.IsWindows():
+ # gpu process on Windows Vista+ runs at Low Integrity and can only
+ # write to certain directories (http://crbug.com/119131)
+ #
+ # TODO(bruening): if scripts die in middle and don't clean up temp
+ # dir, we'll accumulate files in profile dir. should remove
+ # really old files automatically.
+ profile = os.getenv("USERPROFILE")
+ if profile:
+ self.log_parent_dir = profile + "\\AppData\\LocalLow\\"
+ if os.path.exists(self.log_parent_dir):
+ self.log_parent_dir = common.NormalizeWindowsPath(self.log_parent_dir)
+ temp_parent_dir = self.log_parent_dir
+ # Generated every time (even when overridden)
+ self.temp_dir = tempfile.mkdtemp(prefix="vg_logs_", dir=temp_parent_dir)
+ self.log_dir = self.temp_dir # overridable by --keep_logs
+ self.option_parser_hooks = []
+ # TODO(glider): we may not need some of the env vars on some of the
+ # platforms.
+ self._env = {
+ "G_SLICE" : "always-malloc",
+ "NSS_DISABLE_UNLOAD" : "1",
+ "NSS_DISABLE_ARENA_FREE_LIST" : "1",
+ "GTEST_DEATH_TEST_USE_FORK": "1",
+ }
+
+ def ToolName(self):
+ raise NotImplementedError, "This method should be implemented " \
+ "in the tool-specific subclass"
+
+ def Analyze(self, check_sanity=False):
+ raise NotImplementedError, "This method should be implemented " \
+ "in the tool-specific subclass"
+
+ def RegisterOptionParserHook(self, hook):
+ # Frameworks and tools can add their own flags to the parser.
+ self.option_parser_hooks.append(hook)
+
+ def CreateOptionParser(self):
+ # Defines Chromium-specific flags.
+ self._parser = optparse.OptionParser("usage: %prog [options] <program to "
+ "test>")
+ self._parser.disable_interspersed_args()
+ self._parser.add_option("-t", "--timeout",
+ dest="timeout", metavar="TIMEOUT", default=10000,
+ help="timeout in seconds for the run (default 10000)")
+ self._parser.add_option("", "--build-dir",
+ help="the location of the compiler output")
+ self._parser.add_option("", "--source-dir",
+ help="path to top of source tree for this build"
+ "(used to normalize source paths in baseline)")
+ self._parser.add_option("", "--gtest_filter", default="",
+ help="which test case to run")
+ self._parser.add_option("", "--gtest_repeat",
+ help="how many times to run each test")
+ self._parser.add_option("", "--gtest_print_time", action="store_true",
+ default=False,
+ help="show how long each test takes")
+ self._parser.add_option("", "--ignore_exit_code", action="store_true",
+ default=False,
+ help="ignore exit code of the test "
+ "(e.g. test failures)")
+ self._parser.add_option("", "--keep_logs", action="store_true",
+ default=False,
+ help="store memory tool logs in the <tool>.logs "
+ "directory instead of /tmp.\nThis can be "
+ "useful for tool developers/maintainers.\n"
+ "Please note that the <tool>.logs directory "
+ "will be clobbered on tool startup.")
+
+ # To add framework- or tool-specific flags, please add a hook using
+ # RegisterOptionParserHook in the corresponding subclass.
+ # See ValgrindTool for an example.
+ for hook in self.option_parser_hooks:
+ hook(self, self._parser)
+
+ def ParseArgv(self, args):
+ self.CreateOptionParser()
+
+ # self._tool_flags will store those tool flags which we don't parse
+ # manually in this script.
+ self._tool_flags = []
+ known_args = []
+
+ """ We assume that the first argument not starting with "-" is a program
+ name and all the following flags should be passed to the program.
+ TODO(timurrrr): customize optparse instead
+ """
+ while len(args) > 0 and args[0][:1] == "-":
+ arg = args[0]
+ if (arg == "--"):
+ break
+ if self._parser.has_option(arg.split("=")[0]):
+ known_args += [arg]
+ else:
+ self._tool_flags += [arg]
+ args = args[1:]
+
+ if len(args) > 0:
+ known_args += args
+
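+ # Sketch of the resulting split, using a hypothetical argv: for
+ # ["--timeout=60", "--track-origins=yes", "./libyuv_unittest", "--gtest_list_tests"]
+ # the parser knows "--timeout=60", "--track-origins=yes" lands in
+ # self._tool_flags, and the program name plus everything after it stays in
+ # known_args for parse_args below.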
+ self._options, self._args = self._parser.parse_args(known_args)
+
+ self._timeout = int(self._options.timeout)
+ self._source_dir = self._options.source_dir
+ if self._options.keep_logs:
+ # log_parent_dir has trailing slash if non-empty
+ self.log_dir = self.log_parent_dir + "%s.logs" % self.ToolName()
+ if os.path.exists(self.log_dir):
+ shutil.rmtree(self.log_dir)
+ os.mkdir(self.log_dir)
+ logging.info("Logs are in " + self.log_dir)
+
+ self._ignore_exit_code = self._options.ignore_exit_code
+ if self._options.gtest_filter != "":
+ self._args.append("--gtest_filter=%s" % self._options.gtest_filter)
+ if self._options.gtest_repeat:
+ self._args.append("--gtest_repeat=%s" % self._options.gtest_repeat)
+ if self._options.gtest_print_time:
+ self._args.append("--gtest_print_time")
+
+ return True
+
+ def Setup(self, args):
+ return self.ParseArgv(args)
+
+ def ToolCommand(self):
+ raise NotImplementedError, "This method should be implemented " \
+ "in the tool-specific subclass"
+
+ def Cleanup(self):
+ # You may override it in the tool-specific subclass
+ pass
+
+ def Execute(self):
+ """ Execute the app to be tested after successful instrumentation.
+ Full execution command-line provided by subclassers via proc."""
+ logging.info("starting execution...")
+ proc = self.ToolCommand()
+ for var in self._env:
+ common.PutEnvAndLog(var, self._env[var])
+ return common.RunSubprocess(proc, self._timeout)
+
+ def RunTestsAndAnalyze(self, check_sanity):
+ exec_retcode = self.Execute()
+ analyze_retcode = self.Analyze(check_sanity)
+
+ if analyze_retcode:
+ logging.error("Analyze failed.")
+ logging.info("Search the log for '[ERROR]' to see the error reports.")
+ return analyze_retcode
+
+ if exec_retcode:
+ if self._ignore_exit_code:
+ logging.info("Test execution failed, but the exit code is ignored.")
+ else:
+ logging.error("Test execution failed.")
+ return exec_retcode
+ else:
+ logging.info("Test execution completed successfully.")
+
+ if not analyze_retcode:
+ logging.info("Analysis completed successfully.")
+
+ return 0
+
+ def Main(self, args, check_sanity, min_runtime_in_seconds):
+ """Call this to run through the whole process: Setup, Execute, Analyze"""
+ start_time = datetime.datetime.now()
+ retcode = -1
+ if self.Setup(args):
+ retcode = self.RunTestsAndAnalyze(check_sanity)
+ shutil.rmtree(self.temp_dir, ignore_errors=True)
+ self.Cleanup()
+ else:
+ logging.error("Setup failed")
+ end_time = datetime.datetime.now()
+ runtime_in_seconds = (end_time - start_time).seconds
+ hours = runtime_in_seconds / 3600
+ seconds = runtime_in_seconds % 3600
+ minutes = seconds / 60
+ seconds = seconds % 60
+ logging.info("elapsed time: %02d:%02d:%02d" % (hours, minutes, seconds))
+ if (min_runtime_in_seconds > 0 and
+ runtime_in_seconds < min_runtime_in_seconds):
+ logging.error("Layout tests finished too quickly. "
+ "It should have taken at least %d seconds. "
+ "Something went wrong?" % min_runtime_in_seconds)
+ retcode = -1
+ return retcode
+
+ def Run(self, args, module, min_runtime_in_seconds=0):
+ MODULES_TO_SANITY_CHECK = ["base"]
+
+ check_sanity = module in MODULES_TO_SANITY_CHECK
+ return self.Main(args, check_sanity, min_runtime_in_seconds)
+
+
+class ValgrindTool(BaseTool):
+ """Abstract class for running Valgrind tools.
+
+ Always subclass this and implement ToolSpecificFlags() and
+ ExtendOptionParser() for tool-specific stuff.
+ """
+ def __init__(self):
+ super(ValgrindTool, self).__init__()
+ self.RegisterOptionParserHook(ValgrindTool.ExtendOptionParser)
+
+ def UseXML(self):
+ # Override if tool prefers nonxml output
+ return True
+
+ def ExtendOptionParser(self, parser):
+ parser.add_option("", "--suppressions", default=[],
+ action="append",
+ help="path to a valgrind suppression file")
+ parser.add_option("", "--indirect", action="store_true",
+ default=False,
+ help="set BROWSER_WRAPPER rather than "
+ "running valgrind directly")
+ parser.add_option("", "--indirect_webkit_layout", action="store_true",
+ default=False,
+ help="set --wrapper rather than running Dr. Memory "
+ "directly.")
+ parser.add_option("", "--trace_children", action="store_true",
+ default=False,
+ help="also trace child processes")
+ parser.add_option("", "--num-callers",
+ dest="num_callers", default=30,
+ help="number of callers to show in stack traces")
+ parser.add_option("", "--generate_dsym", action="store_true",
+ default=False,
+ help="Generate .dSYM file on Mac if needed. Slow!")
+
+ def Setup(self, args):
+ if not BaseTool.Setup(self, args):
+ return False
+ return True
+
+ def ToolCommand(self):
+ """Get the valgrind command to run."""
+ # Note that self._args begins with the exe to be run.
+ tool_name = self.ToolName()
+
+ # Construct the valgrind command.
+ if 'CHROME_VALGRIND' in os.environ:
+ path = os.path.join(os.environ['CHROME_VALGRIND'], "bin", "valgrind")
+ else:
+ path = "valgrind"
+ proc = [path, "--tool=%s" % tool_name]
+
+ proc += ["--num-callers=%i" % int(self._options.num_callers)]
+
+ if self._options.trace_children:
+ proc += ["--trace-children=yes"]
+ proc += ["--trace-children-skip='*dbus-daemon*'"]
+ proc += ["--trace-children-skip='*dbus-launch*'"]
+ proc += ["--trace-children-skip='*perl*'"]
+ proc += ["--trace-children-skip='*python*'"]
+ # This is really Python, but for some reason Valgrind follows it.
+ proc += ["--trace-children-skip='*lsb_release*'"]
+
+ proc += self.ToolSpecificFlags()
+ proc += self._tool_flags
+
+ suppression_count = 0
+ for suppression_file in self._options.suppressions:
+ if os.path.exists(suppression_file):
+ suppression_count += 1
+ proc += ["--suppressions=%s" % suppression_file]
+
+ if not suppression_count:
+ logging.warning("WARNING: NOT USING SUPPRESSIONS!")
+
+ logfilename = self.log_dir + ("/%s." % tool_name) + "%p"
+ if self.UseXML():
+ proc += ["--xml=yes", "--xml-file=" + logfilename]
+ else:
+ proc += ["--log-file=" + logfilename]
+
+ # The Valgrind command is constructed.
+
+ # Handle --indirect_webkit_layout separately.
+ if self._options.indirect_webkit_layout:
+ # Need to create the wrapper before modifying |proc|.
+ wrapper = self.CreateBrowserWrapper(proc, webkit=True)
+ proc = self._args
+ proc.append("--wrapper")
+ proc.append(wrapper)
+ return proc
+
+ if self._options.indirect:
+ wrapper = self.CreateBrowserWrapper(proc)
+ os.environ["BROWSER_WRAPPER"] = wrapper
+ logging.info('export BROWSER_WRAPPER=' + wrapper)
+ proc = []
+ proc += self._args
+ return proc
+
+ def ToolSpecificFlags(self):
+ raise NotImplementedError, "This method should be implemented " \
+ "in the tool-specific subclass"
+
+ def CreateBrowserWrapper(self, proc, webkit=False):
+ """The program being run invokes Python or something else that can't stand
+ to be valgrinded, and also invokes the Chrome browser. In this case, use a
+ magic wrapper to only valgrind the Chrome browser. Build the wrapper here.
+ Returns the path to the wrapper. It's up to the caller to use the wrapper
+ appropriately.
+ """
+ command = " ".join(proc)
+ # Add the PID of the browser wrapper to the logfile names so we can
+ # separate log files for different UI tests at the analyze stage.
+ command = command.replace("%p", "$$.%p")
+
+ (fd, indirect_fname) = tempfile.mkstemp(dir=self.log_dir,
+ prefix="browser_wrapper.",
+ text=True)
+ f = os.fdopen(fd, "w")
+ f.write('#!/bin/bash\n'
+ 'echo "Started Valgrind wrapper for this test, PID=$$" >&2\n')
+
+ f.write('DIR=`dirname $0`\n'
+ 'TESTNAME_FILE=$DIR/testcase.$$.name\n\n')
+
+ if webkit:
+ # Webkit layout_tests pass the URL as the first line of stdin.
+ f.write('tee $TESTNAME_FILE | %s "$@"\n' % command)
+ else:
+ # Try to get the test case name by looking at the program arguments.
+ # i.e. Chromium ui_tests used --test-name arg.
+ # TODO(timurrrr): This doesn't handle "--test-name Test.Name"
+ # TODO(timurrrr): ui_tests are dead. Where do we use the non-webkit
+ # wrapper now? browser_tests? What do they do?
+ f.write('for arg in $@\ndo\n'
+ ' if [[ "$arg" =~ --test-name=(.*) ]]\n then\n'
+ ' echo ${BASH_REMATCH[1]} >$TESTNAME_FILE\n'
+ ' fi\n'
+ 'done\n\n'
+ '%s "$@"\n' % command)
+
+ f.close()
+ os.chmod(indirect_fname, stat.S_IRUSR|stat.S_IXUSR)
+ return indirect_fname
+
+ def CreateAnalyzer(self):
+ raise NotImplementedError, "This method should be implemented " \
+ "in the tool-specific subclass"
+
+ def GetAnalyzeResults(self, check_sanity=False):
+ # Glob all the files in the log directory
+ filenames = glob.glob(self.log_dir + "/" + self.ToolName() + ".*")
+
+ # If we have browser wrapper, the logfiles are named as
+ # "toolname.wrapper_PID.valgrind_PID".
+ # Let's extract the list of wrapper_PIDs and name it ppids
+ ppids = set([int(f.split(".")[-2]) \
+ for f in filenames if re.search("\.[0-9]+\.[0-9]+$", f)])
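+ # For example, a wrapped log named "memcheck.4242.4243" (hypothetical PIDs)
+ # contributes ppid 4242, while an unwrapped "memcheck.4243" has only one
+ # numeric suffix and is skipped by the regex.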
+
+ analyzer = self.CreateAnalyzer()
+ if len(ppids) == 0:
+ # Fast path - no browser wrapper was set.
+ return analyzer.Report(filenames, None, check_sanity)
+
+ ret = 0
+ for ppid in ppids:
+ testcase_name = None
+ try:
+ f = open(self.log_dir + ("/testcase.%d.name" % ppid))
+ testcase_name = f.read().strip()
+ f.close()
+ wk_layout_prefix="third_party/WebKit/LayoutTests/"
+ wk_prefix_at = testcase_name.rfind(wk_layout_prefix)
+ if wk_prefix_at != -1:
+ testcase_name = testcase_name[wk_prefix_at + len(wk_layout_prefix):]
+ except IOError:
+ pass
+ print "====================================================="
+ print " Below is the report for valgrind wrapper PID=%d." % ppid
+ if testcase_name:
+ print " It was used while running the `%s` test." % testcase_name
+ else:
+ print " You can find the corresponding test"
+ print " by searching the above log for 'PID=%d'" % ppid
+ sys.stdout.flush()
+
+ ppid_filenames = [f for f in filenames \
+ if re.search("\.%d\.[0-9]+$" % ppid, f)]
+ # check_sanity won't work with browser wrappers
+ assert check_sanity == False
+ ret |= analyzer.Report(ppid_filenames, testcase_name)
+ print "====================================================="
+ sys.stdout.flush()
+
+ if ret != 0:
+ print ""
+ print "The Valgrind reports are grouped by test names."
+ print "Each test has its PID printed in the log when the test was run"
+ print "and at the beginning of its Valgrind report."
+ print "Hint: you can search for the reports by Ctrl+F -> `=#`"
+ sys.stdout.flush()
+
+ return ret
+
+
+# TODO(timurrrr): Split into a separate file.
+class Memcheck(ValgrindTool):
+ """Memcheck
+ Dynamic memory error detector for Linux & Mac
+
+ http://valgrind.org/info/tools.html#memcheck
+ """
+
+ def __init__(self):
+ super(Memcheck, self).__init__()
+ self.RegisterOptionParserHook(Memcheck.ExtendOptionParser)
+
+ def ToolName(self):
+ return "memcheck"
+
+ def ExtendOptionParser(self, parser):
+ parser.add_option("--leak-check", "--leak_check", type="string",
+ default="yes", # --leak-check=yes is equivalent of =full
+ help="perform leak checking at the end of the run")
+ parser.add_option("", "--show_all_leaks", action="store_true",
+ default=False,
+ help="also show less blatant leaks")
+ parser.add_option("", "--track_origins", action="store_true",
+ default=False,
+ help="Show whence uninitialized bytes came. 30% slower.")
+
+ def ToolSpecificFlags(self):
+ ret = ["--gen-suppressions=all", "--demangle=no"]
+ ret += ["--leak-check=%s" % self._options.leak_check]
+
+ if self._options.show_all_leaks:
+ ret += ["--show-reachable=yes"]
+ else:
+ ret += ["--show-possibly-lost=no"]
+
+ if self._options.track_origins:
+ ret += ["--track-origins=yes"]
+
+ # TODO(glider): this is a temporary workaround for http://crbug.com/51716
+ # Let's see whether it helps.
+ if common.IsMac():
+ ret += ["--smc-check=all"]
+
+ return ret
+
+ def CreateAnalyzer(self):
+ use_gdb = common.IsMac()
+ return memcheck_analyze.MemcheckAnalyzer(self._source_dir,
+ self._options.show_all_leaks,
+ use_gdb=use_gdb)
+
+ def Analyze(self, check_sanity=False):
+ ret = self.GetAnalyzeResults(check_sanity)
+
+ if ret != 0:
+ logging.info("Please see http://dev.chromium.org/developers/how-tos/"
+ "using-valgrind for the info on Memcheck/Valgrind")
+ return ret
+
+
+class ToolFactory:
+ def Create(self, tool_name):
+ if tool_name == "memcheck":
+ return Memcheck()
+ try:
+ platform_name = common.PlatformNames()[0]
+ except common.NotImplementedError:
+ platform_name = sys.platform + "(Unknown)"
+ raise RuntimeError, "Unknown tool (tool=%s, platform=%s)" % (tool_name,
+ platform_name)
+
+def CreateTool(tool):
+ return ToolFactory().Create(tool)
diff --git a/files/unit_test/basictypes_test.cc b/files/unit_test/basictypes_test.cc
index 89f7644..9aaa2dc 100644
--- a/files/unit_test/basictypes_test.cc
+++ b/files/unit_test/basictypes_test.cc
@@ -13,25 +13,15 @@
namespace libyuv {
-TEST_F(LibYUVBaseTest, Endian) {
- uint16 v16 = 0x1234u;
- uint8 first_byte = *reinterpret_cast<uint8*>(&v16);
-#if defined(LIBYUV_LITTLE_ENDIAN)
- EXPECT_EQ(0x34u, first_byte);
-#else
- EXPECT_EQ(0x12u, first_byte);
-#endif
-}
-
TEST_F(LibYUVBaseTest, SizeOfTypes) {
- int8 i8 = -1;
- uint8 u8 = 1u;
- int16 i16 = -1;
- uint16 u16 = 1u;
- int32 i32 = -1;
- uint32 u32 = 1u;
- int64 i64 = -1;
- uint64 u64 = 1u;
+ int8_t i8 = -1;
+ uint8_t u8 = 1u;
+ int16_t i16 = -1;
+ uint16_t u16 = 1u;
+ int32_t i32 = -1;
+ uint32_t u32 = 1u;
+ int64_t i64 = -1;
+ uint64_t u64 = 1u;
EXPECT_EQ(1u, sizeof(i8));
EXPECT_EQ(1u, sizeof(u8));
EXPECT_EQ(2u, sizeof(i16));
@@ -50,11 +40,4 @@
EXPECT_LT(0u, u64);
}
-TEST_F(LibYUVBaseTest, SizeOfConstants) {
- EXPECT_EQ(8u, sizeof(INT64_C(0)));
- EXPECT_EQ(8u, sizeof(UINT64_C(0)));
- EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));
- EXPECT_EQ(8u, sizeof(UINT64_C(0x8765432112345678)));
-}
-
} // namespace libyuv
diff --git a/files/unit_test/color_test.cc b/files/unit_test/color_test.cc
index 0aa7a54..4bb448d 100644
--- a/files/unit_test/color_test.cc
+++ b/files/unit_test/color_test.cc
@@ -63,10 +63,10 @@
\
/* The test is overall for color conversion matrix being reversible, so */ \
/* this initializes the pixel with 2x2 blocks to eliminate subsampling. */ \
- uint8* p = orig_y; \
+ uint8_t* p = orig_y; \
for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
- uint8 r = static_cast<uint8>(fastrand()); \
+ uint8_t r = static_cast<uint8_t>(fastrand()); \
p[0] = r; \
p[1] = r; \
p[HN] = r; \
@@ -74,7 +74,7 @@
p += 2; \
} \
if (benchmark_width_ & 1) { \
- uint8 r = static_cast<uint8>(fastrand()); \
+ uint8_t r = static_cast<uint8_t>(fastrand()); \
p[0] = r; \
p[HN] = r; \
p += 1; \
@@ -83,13 +83,13 @@
} \
if ((benchmark_height_ & 1) && HS == 2) { \
for (int x = 0; x < benchmark_width_ - 1; x += 2) { \
- uint8 r = static_cast<uint8>(fastrand()); \
+ uint8_t r = static_cast<uint8_t>(fastrand()); \
p[0] = r; \
p[1] = r; \
p += 2; \
} \
if (benchmark_width_ & 1) { \
- uint8 r = static_cast<uint8>(fastrand()); \
+ uint8_t r = static_cast<uint8_t>(fastrand()); \
p[0] = r; \
p += 1; \
} \
@@ -147,10 +147,10 @@
const int kPixels = kWidth * kHeight;
const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
- SIMD_ALIGNED(uint8 orig_y[16]);
- SIMD_ALIGNED(uint8 orig_u[8]);
- SIMD_ALIGNED(uint8 orig_v[8]);
- SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
memset(orig_u, u, kHalfPixels);
memset(orig_v, v, kHalfPixels);
@@ -170,10 +170,10 @@
const int kPixels = kWidth * kHeight;
const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
- SIMD_ALIGNED(uint8 orig_y[16]);
- SIMD_ALIGNED(uint8 orig_u[8]);
- SIMD_ALIGNED(uint8 orig_v[8]);
- SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_u[8]);
+ SIMD_ALIGNED(uint8_t orig_v[8]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
memset(orig_u, u, kHalfPixels);
memset(orig_v, v, kHalfPixels);
@@ -192,8 +192,8 @@
const int kHeight = 1;
const int kPixels = kWidth * kHeight;
- SIMD_ALIGNED(uint8 orig_y[16]);
- SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
/* YUV converted to ARGB. */
@@ -209,8 +209,8 @@
const int kHeight = 1;
const int kPixels = kWidth * kHeight;
- SIMD_ALIGNED(uint8 orig_y[16]);
- SIMD_ALIGNED(uint8 orig_pixels[16 * 4]);
+ SIMD_ALIGNED(uint8_t orig_y[16]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
/* YUV converted to ARGB. */
@@ -471,21 +471,22 @@
printf("\n");
}
+// Stepping by 5 on the inner loop covers 0 to 255 inclusive.
+// Set to 1 for better coverage; 3, 5 or 17 for faster testing.
+#define FASTSTEP 5
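+// With FASTSTEP 5 the inner loop samples y2 = 0, 5, ..., 255, i.e. 52 values
+// per (u, v) pair instead of 256.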
TEST_F(LibYUVColorTest, TestFullYUV) {
- int rh[256] =
- {
- 0,
- },
- gh[256] =
- {
- 0,
- },
- bh[256] = {
- 0,
- };
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
- for (int y2 = 0; y2 < 256; ++y2) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
int r0, g0, b0, r1, g1, b1;
int y = RANDOM256(y2);
YUVToRGBReference(y, u, v, &r0, &g0, &b0);
@@ -503,20 +504,18 @@
}
TEST_F(LibYUVColorTest, TestFullYUVJ) {
- int rh[256] =
- {
- 0,
- },
- gh[256] =
- {
- 0,
- },
- bh[256] = {
- 0,
- };
+ int rh[256] = {
+ 0,
+ };
+ int gh[256] = {
+ 0,
+ };
+ int bh[256] = {
+ 0,
+ };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
- for (int y2 = 0; y2 < 256; ++y2) {
+ for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
int r0, g0, b0, r1, g1, b1;
int y = RANDOM256(y2);
YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
@@ -532,6 +531,7 @@
}
PrintHistogram(rh, gh, bh);
}
+#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {
int r0, g0, b0, r1, g1, b1, r2, g2, b2;
diff --git a/files/unit_test/compare_test.cc b/files/unit_test/compare_test.cc
index 13f7470..136254e 100644
--- a/files/unit_test/compare_test.cc
+++ b/files/unit_test/compare_test.cc
@@ -15,14 +15,17 @@
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
+#include "libyuv/compare_row.h" /* For HammingDistance_C */
#include "libyuv/cpu_id.h"
#include "libyuv/video_common.h"
namespace libyuv {
// hash seed of 5381 recommended.
-static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
- uint32 hash = seed;
+static uint32_t ReferenceHashDjb2(const uint8_t* src,
+ uint64_t count,
+ uint32_t seed) {
+ uint32_t hash = seed;
if (count > 0) {
do {
hash = hash * 33 + *src++;
@@ -31,7 +34,7 @@
return hash;
}
-TEST_F(LibYUVBaseTest, Djb2_Test) {
+TEST_F(LibYUVCompareTest, Djb2_Test) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
align_buffer_page_end(src_b, kMaxTest);
@@ -40,8 +43,8 @@
"The quick brown fox jumps over the lazy dog"
" and feels as if he were in the seventh heaven of typography"
" together with Hermann Zapf";
- uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
- const uint32 kExpectedFoxHash = 2611006483u;
+ uint32_t foxhash = HashDjb2(reinterpret_cast<const uint8_t*>(fox), 131, 5381);
+ const uint32_t kExpectedFoxHash = 2611006483u;
EXPECT_EQ(kExpectedFoxHash, foxhash);
for (int i = 0; i < kMaxTest; ++i) {
@@ -49,8 +52,8 @@
src_b[i] = (fastrand() & 0xff);
}
// Compare different buffers. Expect hash is different.
- uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
- uint32 h2 = HashDjb2(src_b, kMaxTest, 5381);
+ uint32_t h1 = HashDjb2(src_a, kMaxTest, 5381);
+ uint32_t h2 = HashDjb2(src_b, kMaxTest, 5381);
EXPECT_NE(h1, h2);
// Make last half same. Expect hash is different.
@@ -116,15 +119,15 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, BenchmarkDjb2_Opt) {
+TEST_F(LibYUVCompareTest, BenchmarkDjb2_Opt) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i] = i;
}
- uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
- uint32 h1;
+ uint32_t h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32_t h1;
for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a, kMaxTest, 5381);
}
@@ -132,14 +135,14 @@
free_aligned_buffer_page_end(src_a);
}
-TEST_F(LibYUVBaseTest, BenchmarkDjb2_Unaligned) {
+TEST_F(LibYUVCompareTest, BenchmarkDjb2_Unaligned) {
const int kMaxTest = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_a, kMaxTest + 1);
for (int i = 0; i < kMaxTest; ++i) {
src_a[i + 1] = i;
}
- uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
- uint32 h1;
+ uint32_t h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+ uint32_t h1;
for (int i = 0; i < benchmark_iterations_; ++i) {
h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
}
@@ -147,8 +150,8 @@
free_aligned_buffer_page_end(src_a);
}
-TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Opt) {
- uint32 fourcc;
+TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Opt) {
+ uint32_t fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 0; i < kMaxTest; ++i) {
@@ -158,12 +161,12 @@
src_a[0] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
- EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
+ EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
src_a[0] = 255;
src_a[3] = 0;
fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
- EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
+ EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
src_a[3] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
@@ -175,8 +178,8 @@
free_aligned_buffer_page_end(src_a);
}
-TEST_F(LibYUVBaseTest, BenchmarkARGBDetect_Unaligned) {
- uint32 fourcc;
+TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) {
+ uint32_t fourcc;
const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1;
align_buffer_page_end(src_a, kMaxTest);
for (int i = 1; i < kMaxTest; ++i) {
@@ -186,12 +189,12 @@
src_a[0 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
- EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_BGRA), fourcc);
+ EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_BGRA), fourcc);
src_a[0 + 1] = 255;
src_a[3 + 1] = 0;
fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
- EXPECT_EQ(static_cast<uint32>(libyuv::FOURCC_ARGB), fourcc);
+ EXPECT_EQ(static_cast<uint32_t>(libyuv::FOURCC_ARGB), fourcc);
src_a[3 + 1] = 255;
for (int i = 0; i < benchmark_iterations_; ++i) {
@@ -202,7 +205,99 @@
free_aligned_buffer_page_end(src_a);
}
-TEST_F(LibYUVBaseTest, BenchmarkSumSquareError_Opt) {
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ // Test known value
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
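+ // The 16 follows from the differing letters: 'e'^'i' and 'e'^'o' flip 2
+ // bits each, 's'^'c' flips 1 and 't'^'k' flips 5, each pair occurring
+ // twice: (2 + 1 + 5) * 2 = 16.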
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
+#elif defined(HAS_HAMMINGDISTANCE_AVX2)
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ if (has_avx2) {
+ h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
+ } else {
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ if (has_sse42) {
+ h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+ } else {
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ if (has_ssse3) {
+ h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+ } else {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+ }
+ }
+#elif defined(HAS_HAMMINGDISTANCE_SSE42)
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ if (has_sse42) {
+ h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+ } else {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+#else
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+#endif
+ }
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ // Test known value
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint32_t h1 = HammingDistance_C(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
@@ -211,7 +306,131 @@
memcpy(src_a, "test0123test4567", 16);
memcpy(src_b, "tick0123tock4567", 16);
- uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+ uint64_t h1 = ComputeHammingDistance(src_a, src_b, 16);
+ EXPECT_EQ(16u, h1);
+
+ // Test C vs OPT on random buffer
+ MemRandomize(src_a, kMaxWidth);
+ MemRandomize(src_b, kMaxWidth);
+
+ uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth);
+
+ int count =
+ benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(h0, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+// Tests that the low-level implementations match reference C for a specified
+// size. The opt implementations have size limitations: for NEON the counters
+// are 16 bit, so the shorts overflow after 65536 bytes, and one less
+// iteration of the loop is the maximum.
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+static const int kMaxOptCount = 65536 - 32; // 65504
+#else
+static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
+#endif
+
+TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
+ uint32_t h1 = 0;
+ const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 255u, kMaxWidth);
+ memset(src_b, 0u, kMaxWidth);
+
+ uint64_t h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
+ EXPECT_EQ(kMaxWidth * 8ULL, h0);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+ h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
+#elif defined(HAS_HAMMINGDISTANCE_AVX2)
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ if (has_avx2) {
+ h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
+ } else {
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ if (has_sse42) {
+ h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+ } else {
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ if (has_ssse3) {
+ h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
+ } else {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+ }
+ }
+#elif defined(HAS_HAMMINGDISTANCE_SSE42)
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ if (has_sse42) {
+ h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
+ } else {
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+ }
+#else
+ h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
+#endif
+ }
+
+ // A large count can overflow the low-level counters, so the result cannot
+ // be expected to be correct.
+ // TODO(fbarchard): Consider expecting the low 16 bits to match.
+ if (kMaxWidth <= kMaxOptCount) {
+ EXPECT_EQ(kMaxWidth * 8U, h1);
+ } else {
+ if (kMaxWidth * 8ULL != static_cast<uint64_t>(h1)) {
+ printf(
+ "warning - HammingDistance_Opt %u does not match %llu "
+ "but length of %u is longer than guaranteed.\n",
+ h1, kMaxWidth * 8ULL, kMaxWidth);
+ } else {
+ printf(
+ "warning - HammingDistance_Opt %u matches but length of %u "
+ "is longer than guaranteed.\n",
+ h1, kMaxWidth);
+ }
+ }
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, TestHammingDistance) {
+ align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
+ align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
+ memset(src_a, 255u, benchmark_width_ * benchmark_height_);
+ memset(src_b, 0, benchmark_width_ * benchmark_height_);
+
+ uint64_t h1 = 0;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = ComputeHammingDistance(src_a, src_b,
+ benchmark_width_ * benchmark_height_);
+ }
+ EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1);
+
+ free_aligned_buffer_page_end(src_a);
+ free_aligned_buffer_page_end(src_b);
+}
+
+TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_page_end(src_a, kMaxWidth);
+ align_buffer_page_end(src_b, kMaxWidth);
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint64_t h1 = ComputeSumSquareError(src_a, src_b, 16);
EXPECT_EQ(790u, h1);
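+ // 790 is the sum of the squared byte differences between the two strings:
+ // (e-i)^2 + (s-c)^2 + (t-k)^2 + (e-o)^2 + (s-c)^2 + (t-k)^2
+ // = 16 + 256 + 81 + 100 + 256 + 81 = 790.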
for (int i = 0; i < kMaxWidth; ++i) {
@@ -234,14 +453,14 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, SumSquareError) {
+TEST_F(LibYUVCompareTest, SumSquareError) {
const int kMaxWidth = 4096 * 3;
align_buffer_page_end(src_a, kMaxWidth);
align_buffer_page_end(src_b, kMaxWidth);
memset(src_a, 0, kMaxWidth);
memset(src_b, 0, kMaxWidth);
- uint64 err;
+ uint64_t err;
err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
EXPECT_EQ(0u, err);
@@ -263,10 +482,10 @@
}
MaskCpuFlags(disable_cpu_flags_);
- uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ uint64_t c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
MaskCpuFlags(benchmark_cpu_info_);
- uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ uint64_t opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
EXPECT_EQ(c_err, opt_err);
@@ -274,7 +493,7 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, BenchmarkPsnr_Opt) {
+TEST_F(LibYUVCompareTest, BenchmarkPsnr_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
@@ -285,9 +504,10 @@
MaskCpuFlags(benchmark_cpu_info_);
double opt_time = get_time();
- for (int i = 0; i < benchmark_iterations_; ++i)
+ for (int i = 0; i < benchmark_iterations_; ++i) {
CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
+ }
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
@@ -298,7 +518,7 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, BenchmarkPsnr_Unaligned) {
+TEST_F(LibYUVCompareTest, BenchmarkPsnr_Unaligned) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
@@ -309,9 +529,10 @@
MaskCpuFlags(benchmark_cpu_info_);
double opt_time = get_time();
- for (int i = 0; i < benchmark_iterations_; ++i)
+ for (int i = 0; i < benchmark_iterations_; ++i) {
CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
+ }
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
@@ -322,7 +543,7 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, Psnr) {
+TEST_F(LibYUVCompareTest, Psnr) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
@@ -399,7 +620,7 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, DISABLED_BenchmarkSsim_Opt) {
+TEST_F(LibYUVCompareTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
@@ -410,9 +631,10 @@
MaskCpuFlags(benchmark_cpu_info_);
double opt_time = get_time();
- for (int i = 0; i < benchmark_iterations_; ++i)
+ for (int i = 0; i < benchmark_iterations_; ++i) {
CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
+ }
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
@@ -423,7 +645,7 @@
free_aligned_buffer_page_end(src_b);
}
-TEST_F(LibYUVBaseTest, Ssim) {
+TEST_F(LibYUVCompareTest, Ssim) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;
const int b = 128;
diff --git a/files/unit_test/convert_test.cc b/files/unit_test/convert_test.cc
index 4156435..32a4cd1 100644
--- a/files/unit_test/convert_test.cc
+++ b/files/unit_test/convert_test.cc
@@ -8,9 +8,12 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <stdlib.h>
#include <time.h>
+#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */
+
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -26,102 +29,91 @@
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"
+#if defined(__arm__) || defined(__aarch64__)
+// The ARM version subsamples by summing 4 pixels, then multiplies by a matrix
+// with 4x smaller coefficients rounded to the nearest integer.
+#define ARM_YUV_ERROR 4
+#else
+#define ARM_YUV_ERROR 0
+#endif
+
namespace libyuv {
+// Aliases to copy pixels as-is.
+#define AR30ToAR30 ARGBCopy
+#define ABGRToABGR ARGBCopy
+
#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
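+// Round-up division, e.g. SUBSAMPLE(5, 2) == 3, so odd-sized frames get the
+// extra chroma row/column.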
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+// Planar test
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
+ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
+ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
+ "DST SRC_SUBSAMP_X unsupported"); \
+ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
+ "DST SRC_SUBSAMP_Y unsupported"); \
+ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
+ "DST DST_SUBSAMP_X unsupported"); \
+ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
+ "DST DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
- OFF); \
- align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (fastrand() & 0xff); \
- } \
- } \
- memset(dst_y_c, 1, kWidth* kHeight); \
- memset(dst_u_c, 2, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 3, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 101, kWidth* kHeight); \
- memset(dst_u_opt, 102, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 103, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
+ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
+ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
+ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_u, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(src_v, \
+ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
+ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
+ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
+ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
+ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
+ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
+ reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
+ reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
- src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+ reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
+ reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
+ reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
+ reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
+ reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
+ NEG kHeight); \
} \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
+ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
} \
- EXPECT_EQ(0, max_diff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
+ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
+ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
+ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
} \
- EXPECT_LE(max_diff, 3); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = abs( \
- static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>( \
- dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 3); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_u_c); \
free_aligned_buffer_page_end(dst_v_c); \
@@ -133,25 +125,36 @@
free_aligned_buffer_page_end(src_v); \
}
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
- SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
+ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
+ DST_SUBSAMP_X, DST_SUBSAMP_Y) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
-TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
-TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
-TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
-TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
-TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
-TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
+TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
+TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
+TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
+TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
+TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
+TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \
@@ -175,8 +178,8 @@
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- uint8* src_u = src_uv + OFF_U; \
- uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
+ uint8_t* src_u = src_uv + OFF_U; \
+ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
@@ -278,6 +281,23 @@
TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
+// Wrapper to keep the API the same as the other planar conversions; I400 has
+// no chroma planes, so the U and V source arguments are ignored.
+int I400ToNV21(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* /* src_u */,
+ int /* src_stride_u */,
+ const uint8_t* /* src_v */,
+ int /* src_stride_v */,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu,
+ dst_stride_vu, width, height);
+}
+
#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
@@ -291,10 +311,10 @@
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
- align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
@@ -309,21 +329,21 @@
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
@@ -337,12 +357,12 @@
} \
EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
int abs_diff = \
abs(static_cast<int>( \
- dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+ dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
static_cast<int>( \
- dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+ dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
@@ -371,6 +391,102 @@
TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
+TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
+TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
+ OFF) \
+ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+ OFF); \
+ align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+ align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 0 + OFF] = \
+ (fastrand() & 0xff); \
+ src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 1 + OFF] = \
+ (fastrand() & 0xff); \
+ } \
+ } \
+ memset(dst_y_c, 1, kWidth* kHeight); \
+ memset(dst_uv_c, 2, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 101, kWidth* kHeight); \
+ memset(dst_uv_opt, 102, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR( \
+ src_y + OFF, kWidth, src_uv + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>( \
+ dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
+ static_cast<int>( \
+ dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_page_end(dst_y_c); \
+ free_aligned_buffer_page_end(dst_uv_c); \
+ free_aligned_buffer_page_end(dst_y_opt); \
+ free_aligned_buffer_page_end(dst_uv_opt); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_uv); \
+ }
+
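+// Each expansion below instantiates four variants of one conversion:
+// _Any uses benchmark_width_ - 4 to exercise widths that are not a
+// multiple of the SIMD step, _Unaligned shifts the buffers by one byte
+// (OFF = 1), _Invert passes a negative height to flip the image, and
+// _Opt is the aligned full-speed path.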
+#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0)   \
+  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
+ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
+ SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
+
+// TODO(fbarchard): Fix msan on this unittest
+// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
@@ -491,108 +607,102 @@
#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
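+// ALIGNINT rounds V up to the next multiple of ALIGN, e.g.
+// ALIGNINT(17, 16) == 32 and ALIGNINT(16, 16) == 16.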
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
- TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
- const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
- const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
- const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
- align_buffer_page_end(src_u, kSizeUV + OFF); \
- align_buffer_page_end(src_v, kSizeUV + OFF); \
- align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
- align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (fastrand() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (fastrand() & 0xff); \
- src_v[i + OFF] = (fastrand() & 0xff); \
- } \
- memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
- memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
- MaskCpuFlags(disable_cpu_flags_); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(benchmark_cpu_info_); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
- src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
- kStrideB, kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
- align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
- memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
- FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
- kWidth * BPP_C, kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_y); \
- free_aligned_buffer_page_end(src_u); \
- free_aligned_buffer_page_end(src_v); \
- free_aligned_buffer_page_end(dst_argb_c); \
- free_aligned_buffer_page_end(dst_argb_opt); \
- free_aligned_buffer_page_end(dst_argb32_c); \
- free_aligned_buffer_page_end(dst_argb32_opt); \
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, N, NEG, OFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+ align_buffer_page_end(src_u, kSizeUV + OFF); \
+ align_buffer_page_end(src_v, kSizeUV + OFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (fastrand() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (fastrand() & 0xff); \
+ src_v[i + OFF] = (fastrand() & 0xff); \
+ } \
+ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ double time0 = get_time(); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ double time1 = get_time(); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
+ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
+ kStrideB, kWidth, NEG kHeight); \
+ } \
+ double time2 = get_time(); \
+ printf(" %8d us C - %8d us OPT\n", \
+ static_cast<int>((time1 - time0) * 1e6), \
+ static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
+ } \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
}
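+// The get_time() calls above time a single C pass against the optimized
+// iterations and print both in microseconds. The EXPECT_EQ loop replaces
+// the old max_diff/DIFF tolerance, so these conversions are now required
+// to be bit-exact between the C and SIMD paths.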
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, \
- BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, _Opt, +, 0)
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1, 9, ARGB, 4)
-TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
-TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
+TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
+TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
+TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
+TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
+TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
+TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
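+// AR30 and AB30 are 2:10:10:10 formats (10 bits per color channel plus
+// 2-bit alpha) packed into 4 bytes, so they reuse the same 4-bpp test
+// parameters as the 8-bit ARGB variants above.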
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \
@@ -663,8 +773,8 @@
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
+ BPP_B, W1280, DIFF, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -699,9 +809,9 @@
align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \
memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \
memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \
kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
kHeight); \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
@@ -723,21 +833,27 @@
free_aligned_buffer_page_end(dst_argb32_opt); \
}
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ DIFF) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
benchmark_width_, DIFF, _Opt, +, 0)
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
+TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
#ifdef DO_THREE_PLANES
// Do 3 allocations for YUV; conventional but slower.
@@ -864,33 +980,27 @@
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
benchmark_width_, DIFF, _Opt, +, 0)
-TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
-#if defined(__arm__) || defined(__aarch64__)
-// arm version subsamples by summing 4 pixels then multiplying by matrix with
-// 4x smaller coefficients which are rounded to nearest integer.
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 4)
-#else
-TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
-TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, 0)
-#endif
-TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
+TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+// TODO(fbarchard): Investigate J420 error of 11 on Windows.
+TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 11)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
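+// In the tolerances above, ARM_YUV_ERROR presumably expands to a larger
+// diff on ARM builds: the NEON path subsamples by summing 4 pixels and
+// then multiplying by a matrix with 4x smaller, rounded coefficients,
+// so J420/J422 output differs slightly from the C code.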
#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
@@ -964,6 +1074,8 @@
TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
@@ -1032,15 +1144,9 @@
MaskCpuFlags(benchmark_cpu_info_); \
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
kHeight); \
- int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
+ EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
} \
- EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
@@ -1060,37 +1166,47 @@
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
HEIGHT_B, DIFF)
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+// TODO(fbarchard): make ARM version of C code that matches NEON.
+TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0)
+TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0)
+TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0)
TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0)
TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR)
TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
HEIGHT_B, W1280, DIFF, N, NEG, OFF) \
@@ -1240,8 +1356,8 @@
TESTSYM(ABGRToARGB, 4, 4, 1)
TEST_F(LibYUVConvertTest, Test565) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 pixels565[256][2]);
+ SIMD_ALIGNED(uint8_t orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8_t pixels565[256][2]);
for (int i = 0; i < 256; ++i) {
for (int j = 0; j < 4; ++j) {
@@ -1249,7 +1365,7 @@
}
}
ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
EXPECT_EQ(610919429u, checksum);
}
@@ -1274,6 +1390,7 @@
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1300,6 +1417,7 @@
// EOI, SOI. Expect pass.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - kOff + 0] = 0xff;
orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
for (int times = 0; times < benchmark_iterations_; ++times) {
@@ -1333,6 +1451,7 @@
// SOI but no EOI. Expect fail.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
for (int times = 0; times < benchmark_iterations_; ++times) {
EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
}
@@ -1350,85 +1469,628 @@
TEST_F(LibYUVConvertTest, FuzzJpeg) {
// SOI but no EOI. Expect fail.
for (int times = 0; times < benchmark_iterations_; ++times) {
- const int kSize = fastrand() % 5000 + 2;
+ const int kSize = fastrand() % 5000 + 3;
align_buffer_page_end(orig_pixels, kSize);
MemRandomize(orig_pixels, kSize);
// Add SOI so frame will be scanned.
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[2] = 0xff;
orig_pixels[kSize - 1] = 0xff;
- ValidateJpeg(orig_pixels, kSize); // Failure normally expected.
+ ValidateJpeg(orig_pixels,
+ kSize); // Failure normally expected.
free_aligned_buffer_page_end(orig_pixels);
}
}
-TEST_F(LibYUVConvertTest, MJPGToI420) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
- align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
- align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
- SUBSAMPLE(benchmark_height_, 2));
+// Test data created in GIMP. When exporting the JPEG, disable
+// thumbnails etc., choose a subsampling, and use low quality
+// (50) to keep the size small. Generated with "xxd -i test.jpg".
+// test 0 is J400
+static const uint8_t kTest0Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10,
+ 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01,
+ 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest0JpgLen = 421;
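+// The embedded test images are 32x16 pixels. A new array can be made the
+// same way: export from GIMP as described above, then run
+// "xxd -i test.jpg" and paste the output (file name is illustrative).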
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+// test 1 is J444
+static const uint8_t kTest1Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda,
+ 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01,
+ 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb,
+ 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11,
+ 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00,
+ 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99,
+ 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31,
+ 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01,
+ 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72,
+ 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11,
+ 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00,
+ 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2,
+ 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c,
+ 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61,
+ 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21,
+ 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01,
+ 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48,
+ 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01,
+ 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff,
+ 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26,
+ 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02,
+ 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5,
+ 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00,
+ 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61,
+ 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01,
+ 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a,
+ 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96,
+ 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad,
+ 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7,
+ 0xd4, 0xff, 0xd9};
+static const size_t kTest1JpgLen = 735;
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret =
- MJPGToI420(orig_pixels, kSize, dst_y_opt, benchmark_width_, dst_u_opt,
- SUBSAMPLE(benchmark_width_, 2), dst_v_opt,
- SUBSAMPLE(benchmark_width_, 2), benchmark_width_,
- benchmark_height_, benchmark_width_, benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
- }
+// test 2 is J420
+static const uint8_t kTest2Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00,
+ 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10,
+ 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02,
+ 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62,
+ 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f,
+ 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f,
+ 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03,
+ 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28,
+ 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4,
+ 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51,
+ 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb,
+ 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9,
+ 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c,
+ 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f,
+ 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11,
+ 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e,
+ 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01,
+ 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10,
+ 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91,
+ 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01,
+ 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b,
+ 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec,
+ 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c,
+ 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff,
+ 0xd9};
+static const size_t kTest2JpgLen = 685;
- free_aligned_buffer_page_end(dst_y_opt);
- free_aligned_buffer_page_end(dst_u_opt);
- free_aligned_buffer_page_end(dst_v_opt);
- free_aligned_buffer_page_end(orig_pixels);
+// test 3 is J422
+static const uint8_t kTest3Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4,
+ 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03,
+ 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84,
+ 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda,
+ 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32,
+ 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00,
+ 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31,
+ 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f,
+ 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9,
+ 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6,
+ 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03,
+ 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff,
+ 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21,
+ 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53,
+ 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca,
+ 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04,
+ 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9,
+ 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5,
+ 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c,
+ 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00,
+ 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest3JpgLen = 704;
+
+// test 4 is J422 vertical - not supported
+static const uint8_t kTest4Jpg[] = {
+ 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
+ 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43,
+ 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12,
+ 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23,
+ 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40,
+ 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51,
+ 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64,
+ 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12,
+ 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63,
+ 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03,
+ 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00,
+ 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff,
+ 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff,
+ 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00,
+ 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4,
+ 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03,
+ 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05,
+ 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26,
+ 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01,
+ 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00,
+ 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff,
+ 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02,
+ 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9,
+ 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01,
+ 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0,
+ 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e,
+ 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde,
+ 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a,
+ 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02,
+ 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19,
+ 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff,
+ 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca,
+ 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01,
+ 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff,
+ 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31,
+ 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08,
+ 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a,
+ 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd,
+ 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30,
+ 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
+ 0x0b, 0xb7, 0xd4, 0xff, 0xd9};
+static const size_t kTest4JpgLen = 701;
+
+TEST_F(LibYUVConvertTest, TestMJPGSize) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ printf("test jpeg size %d x %d\n", width, height);
}
-TEST_F(LibYUVConvertTest, MJPGToARGB) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg
- ? benchmark_width_ * benchmark_height_
- : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_page_end(orig_pixels, kSize);
- align_buffer_page_end(dst_argb_opt, benchmark_width_ * benchmark_height_ * 4);
+TEST_F(LibYUVConvertTest, TestMJPGToI420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
- // EOI, SOI to make MJPG appear valid.
- memset(orig_pixels, 0, kSize);
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
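+  // Scale the iteration count so the total number of decoded pixels is
+  // comparable to benchmark_iterations_ frames of benchmark_width_ x
+  // benchmark_height_, since the embedded JPEG is much smaller.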
- for (int times = 0; times < benchmark_iterations_; ++times) {
- int ret = MJPGToARGB(orig_pixels, kSize, dst_argb_opt, benchmark_width_ * 4,
- benchmark_width_, benchmark_height_, benchmark_width_,
- benchmark_height_);
- // Expect failure because image is not really valid.
- EXPECT_EQ(1, ret);
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_u, half_width * half_height);
+ align_buffer_page_end(dst_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
+ dst_v, half_width, width, height, width, height);
}
+  // Expect success
+ EXPECT_EQ(0, ret);
- free_aligned_buffer_page_end(dst_argb_opt);
- free_aligned_buffer_page_end(orig_pixels);
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
+ uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_u_hash, 2501859930u);
+ EXPECT_EQ(dst_v_hash, 2126459123u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
}
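+// The hash checks above use HashDjb2 as a compact golden-output test:
+// the expected constants were presumably recorded from a known-good run,
+// so any change in decoded bytes appears as a hash mismatch without
+// storing reference planes.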
+TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ // Convert to NV21
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_vu, half_width * half_height * 2);
+
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
+ half_width * 2, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Convert to I420
+ align_buffer_page_end(dst2_y, width * height);
+ align_buffer_page_end(dst2_u, half_width * half_height);
+ align_buffer_page_end(dst2_v, half_width * half_height);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
+ dst2_v, half_width, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Convert I420 to NV21
+ align_buffer_page_end(dst3_y, width * height);
+ align_buffer_page_end(dst3_vu, half_width * half_height * 2);
+
+ I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
+ width, dst3_vu, half_width * 2, width, height);
+
+ for (int i = 0; i < width * height; ++i) {
+ EXPECT_EQ(dst_y[i], dst3_y[i]);
+ }
+ for (int i = 0; i < half_width * half_height * 2; ++i) {
+ EXPECT_EQ(dst_vu[i], dst3_vu[i]);
+ }
+
+ free_aligned_buffer_page_end(dst3_y);
+ free_aligned_buffer_page_end(dst3_vu);
+
+ free_aligned_buffer_page_end(dst2_y);
+ free_aligned_buffer_page_end(dst2_u);
+ free_aligned_buffer_page_end(dst2_v);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_vu);
+}
+
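+// The test above cross-checks the direct MJPG->NV21 path against
+// MJPG->I420 followed by I420ToNV21. NV21 is a Y plane plus interleaved
+// VU samples at half resolution, so the two routes must produce
+// identical bytes if the converters agree.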
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 1069662856u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 3543430771u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 330644005u);
+ EXPECT_EQ(dst_uv_hash, 135214341u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int half_width = (width + 1) / 2;
+ int half_height = (height + 1) / 2;
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_y, width * height);
+ align_buffer_page_end(dst_uv, half_width * half_height * 2);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv,
+ half_width * 2, width, height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
+ uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
+ EXPECT_EQ(dst_y_hash, 2682851208u);
+ EXPECT_EQ(dst_uv_hash, 506143297u);
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_uv);
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
+ int width = 0;
+ int height = 0;
+ int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
+ EXPECT_EQ(0, ret);
+
+ int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
+ benchmark_height_ / (width * height);
+
+ align_buffer_page_end(dst_argb, width * height * 4);
+ for (int times = 0; times < benchmark_iterations; ++times) {
+ ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width,
+ height, width, height);
+ }
+  // Expect success
+ EXPECT_EQ(0, ret);
+
+ // Test result matches known hash value.
+ uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
+ EXPECT_EQ(dst_argb_hash, 2355976473u);
+
+ free_aligned_buffer_page_end(dst_argb);
+}
+
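+// ShowJPegInfo classifies a JPEG by the per-component sampling factors
+// reported by MJpegDecoder: Y at 2x2 with 1x1 chroma is J420, Y at 2x1
+// is J422, all components at 1x1 is J444, and a single grayscale
+// component is J400.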
+static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+
+ int width = mjpeg_decoder.GetWidth();
+ int height = mjpeg_decoder.GetHeight();
+
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J420, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J422, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ printf("JPeg is J444, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ printf("JPeg is J400, %dx%d %d bytes\n", width, height,
+ static_cast<int>(sample_size));
+ } else {
+ // Unknown colorspace.
+ printf("JPeg is Unknown colorspace.\n");
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret;
+}
+
+TEST_F(LibYUVConvertTest, TestMJPGInfo) {
+ EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
+ EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg,
+ kTest4JpgLen)); // Valid but unsupported.
+}
#endif // HAVE_JPEG
TEST_F(LibYUVConvertTest, NV12Crop) {
@@ -1444,7 +2106,7 @@
const int sample_size =
kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2;
align_buffer_page_end(src_y, sample_size);
- uint8* src_uv = src_y + kWidth * kHeight;
+ uint8_t* src_uv = src_y + kWidth * kHeight;
align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
@@ -1511,14 +2173,86 @@
free_aligned_buffer_page_end(src_y);
}
+TEST_F(LibYUVConvertTest, I420CropOddY) {
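+  // crop_y of 1 exercises a crop that starts on an odd luma row; the chroma
+  // planes are offset by crop_y / 2 rows.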
+ const int SUBSAMP_X = 2;
+ const int SUBSAMP_Y = 2;
+ const int kWidth = benchmark_width_;
+ const int kHeight = benchmark_height_;
+ const int crop_y = 1;
+ const int kDestWidth = benchmark_width_;
+ const int kDestHeight = benchmark_height_ - crop_y * 2;
+ const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X);
+ const int sample_size = kWidth * kHeight +
+ kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) +
+ kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y);
+ align_buffer_page_end(src_y, sample_size);
+ uint8_t* src_u = src_y + kWidth * kHeight;
+ uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y);
+
+ align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
+ align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+ SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ for (int i = 0; i < kHeight * kWidth; ++i) {
+ src_y[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) {
+ src_u[i] = (fastrand() & 0xff);
+ }
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) {
+ src_v[i] = (fastrand() & 0xff);
+ }
+ memset(dst_y, 1, kDestWidth * kDestHeight);
+ memset(dst_u, 2,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+ memset(dst_v, 3,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+
+ MaskCpuFlags(benchmark_cpu_info_);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v,
+ SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight,
+ kDestWidth, kDestHeight, libyuv::kRotate0,
+ libyuv::FOURCC_I420);
+ }
+
+ for (int i = 0; i < kDestHeight; ++i) {
+ for (int j = 0; j < kDestWidth; ++j) {
+ EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j],
+ dst_y[i * kDestWidth + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j],
+ dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+ for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) {
+ for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) {
+ EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j],
+ dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]);
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y);
+ free_aligned_buffer_page_end(dst_u);
+ free_aligned_buffer_page_end(dst_v);
+ free_aligned_buffer_page_end(src_y);
+}
+
TEST_F(LibYUVConvertTest, TestYToARGB) {
- uint8 y[32];
- uint8 expectedg[32];
+ uint8_t y[32];
+ uint8_t expectedg[32];
for (int i = 0; i < 32; ++i) {
y[i] = i * 5 + 17;
expectedg[i] = static_cast<int>((y[i] - 16) * 1.164f + 0.5f);
}
- uint8 argb[32 * 4];
+ uint8_t argb[32 * 4];
YToARGB(y, 0, argb, 0, 32, 1);
for (int i = 0; i < 32; ++i) {
@@ -1530,7 +2264,7 @@
}
}
-static const uint8 kNoDither4x4[16] = {
+static const uint8_t kNoDither4x4[16] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
@@ -1557,7 +2291,7 @@
}
// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
+static const uint8_t kDither565_4x4[16] = {
0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};
@@ -1728,6 +2462,8 @@
TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)
+// Transitive tests. A to B to C is the same as A to C.
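+// e.g. converting I420 to ARGB and then ARGB to RAW should produce the same
+// pixels as converting I420 to RAW directly.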
+
#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
W1280, N, NEG, OFF, FMT_C, BPP_C) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##_##FMT_C##N) { \
@@ -1800,10 +2536,15 @@
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
+TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
+TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
+TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
-TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
+TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
@@ -1888,6 +2629,64 @@
TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
+#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
+ OFF, FMT_C, BPP_C) \
+ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_##FMT_C##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \
+ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
+ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
+ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \
+ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \
+ const int kStrideC = kWidth * BPP_C; \
+ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
+ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
+ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
+ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
+ FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \
+ kWidth, NEG kHeight); \
+ /* Convert B to C */ \
+ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, kStrideC, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kStrideC * kHeight; i += 4) { \
+ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \
+ EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \
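+      /* The 30-bit formats keep only 2 alpha bits, so allow alpha error */  \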
+ EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \
+ } \
+ free_aligned_buffer_page_end(src_argb_a); \
+ free_aligned_buffer_page_end(dst_argb_b); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_bc); \
+ }
+
+#define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \
+ benchmark_width_ - 4, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \
+ _Opt, +, 0, FMT_C, BPP_C)
+
+// Caveat: Destination needs to be 4 bytes per pixel.
+TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
+TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
+TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
+TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
+
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames
uint32_t src[4];
@@ -1923,4 +2722,502 @@
EXPECT_EQ(dst[3], src[1]);
}
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
+ // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
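+  // (n + 7) & ~7 rounds n up to the next multiple of 8, e.g. 13 becomes 16.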
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ARGBToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ARGBToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ARGBTOAR30ROW_AVX2
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
+ // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
+ align_buffer_page_end(src, kPixels * 4);
+ align_buffer_page_end(dst_opt, kPixels * 4);
+ align_buffer_page_end(dst_c, kPixels * 4);
+ MemRandomize(src, kPixels * 4);
+ memset(dst_opt, 0, kPixels * 4);
+ memset(dst_c, 1, kPixels * 4);
+
+ ABGRToAR30Row_C(src, dst_c, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ ABGRToAR30Row_AVX2(src, dst_opt, kPixels);
+ } else if (has_ssse3) {
+ ABGRToAR30Row_SSSE3(src, dst_opt, kPixels);
+ } else {
+ ABGRToAR30Row_C(src, dst_opt, kPixels);
+ }
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_opt[i], dst_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src);
+ free_aligned_buffer_page_end(dst_opt);
+ free_aligned_buffer_page_end(dst_c);
+}
+#endif // HAS_ABGRTOAR30ROW_AVX2
+
+// TODO(fbarchard): Fix clamping issue affected by U channel.
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \
+ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ const int kBpc = 2; \
+ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
+ reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
+ } \
+ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+ MaskCpuFlags(disable_cpu_flags_); \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+ MaskCpuFlags(benchmark_cpu_info_); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B( \
+ reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
+ reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
+ reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
+ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
+ int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
+ static_cast<int>(dst_argb_opt[i + DOFF])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_y); \
+ free_aligned_buffer_page_end(src_u); \
+ free_aligned_buffer_page_end(src_v); \
+ free_aligned_buffer_page_end(dst_argb_c); \
+ free_aligned_buffer_page_end(dst_argb_opt); \
+ }
+
+#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \
+ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0)
+
+TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
+TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
+TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
+TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2)
+TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
+TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
+TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)
+TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2)
+
+static int Clamp(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 255) {
+ y = 255;
+ }
+ return y;
+}
+
+static int Clamp10(int y) {
+ if (y < 0) {
+ y = 0;
+ }
+ if (y > 1023) {
+ y = 1023;
+ }
+ return y;
+}
+
+// Test 8 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToARGB) {
+ const int kSize = 256;
+ int histogram_b[256];
+ int histogram_g[256];
+ int histogram_r[256];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+    orig_u[i] = 128;  // 128 represents zero (neutral) chroma.
+ orig_v[i] = 128;
+ }
+
+ H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
+ int expected_y = Clamp(static_cast<int>((i - 16) * 1.164f));
+ EXPECT_NEAR(b, expected_y, 1);
+ EXPECT_NEAR(g, expected_y, 1);
+ EXPECT_NEAR(r, expected_y, 1);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 8 bit RGB
+TEST_F(LibYUVConvertTest, TestH010ToARGB) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+    orig_u[i] = 512;  // 512 represents zero (neutral) chroma at 10 bits.
+ orig_v[i] = 512;
+ }
+
+ H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b = argb_pixels[i * 4 + 0];
+ int g = argb_pixels[i * 4 + 1];
+ int r = argb_pixels[i * 4 + 2];
+ int a = argb_pixels[i * 4 + 3];
+ ++histogram_b[b];
+ ++histogram_g[g];
+ ++histogram_r[r];
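+    // The 10-bit black level is 64 (16 * 4); dividing by 4 maps the 10-bit
+    // range to 8 bits: (i / 4 - 16) * 1.164 == (i - 64) * 1.164 / 4.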
+ int expected_y = Clamp(static_cast<int>((i - 64) * 1.164f / 4));
+ EXPECT_NEAR(b, expected_y, 1);
+ EXPECT_NEAR(g, expected_y, 1);
+ EXPECT_NEAR(r, expected_y, 1);
+ EXPECT_EQ(a, 255);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(argb_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: Results are compared with EXPECT_NEAR due to float rounding in
+// the expected result.
+TEST_F(LibYUVConvertTest, TestH010ToAR30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+    orig_u[i] = 512;  // 512 represents zero (neutral) chroma at 10 bits.
+ orig_v[i] = 512;
+ }
+
+ H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
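+  // AR30 layout: bits 0-9 blue, 10-19 green, 20-29 red, 30-31 alpha.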
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test 10 bit YUV to 10 bit RGB
+// Caveat: Results are compared with EXPECT_NEAR due to float rounding in
+// the expected result.
+TEST_F(LibYUVConvertTest, TestH010ToAB30) {
+ const int kSize = 1024;
+ int histogram_b[1024];
+ int histogram_g[1024];
+ int histogram_r[1024];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+
+ align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
+ align_buffer_page_end(ab30_pixels, kSize * 4);
+ uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
+ uint16_t* orig_u = orig_y + kSize;
+ uint16_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+    orig_u[i] = 512;  // 512 represents zero (neutral) chroma at 10 bits.
+ orig_v[i] = 512;
+ }
+
+ H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int r10 = reinterpret_cast<uint32_t*>(ab30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 10) & 1023;
+ int b10 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ab30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
+ int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ab30_pixels);
+}
+
+// Test 8 bit YUV to 10 bit RGB
+TEST_F(LibYUVConvertTest, TestH420ToAR30) {
+ const int kSize = 256;
+ const int kHistSize = 1024;
+ int histogram_b[kHistSize];
+ int histogram_g[kHistSize];
+ int histogram_r[kHistSize];
+ memset(histogram_b, 0, sizeof(histogram_b));
+ memset(histogram_g, 0, sizeof(histogram_g));
+ memset(histogram_r, 0, sizeof(histogram_r));
+ align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2);
+ align_buffer_page_end(ar30_pixels, kSize * 4);
+ uint8_t* orig_y = orig_yuv;
+ uint8_t* orig_u = orig_y + kSize;
+ uint8_t* orig_v = orig_u + kSize / 2;
+
+ // Test grey scale
+ for (int i = 0; i < kSize; ++i) {
+ orig_y[i] = i;
+ }
+ for (int i = 0; i < kSize / 2; ++i) {
+    orig_u[i] = 128;  // 128 represents zero (neutral) chroma.
+ orig_v[i] = 128;
+ }
+
+ H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);
+
+ for (int i = 0; i < kSize; ++i) {
+ int b10 = reinterpret_cast<uint32_t*>(ar30_pixels)[i] & 1023;
+ int g10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 10) & 1023;
+ int r10 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 20) & 1023;
+ int a2 = (reinterpret_cast<uint32_t*>(ar30_pixels)[i] >> 30) & 3;
+ ++histogram_b[b10];
+ ++histogram_g[g10];
+ ++histogram_r[r10];
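+    // The 8-bit conversion result is scaled by 4 to span the 10-bit range.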
+ int expected_y = Clamp10(static_cast<int>((i - 16) * 1.164f * 4.f));
+ EXPECT_NEAR(b10, expected_y, 4);
+ EXPECT_NEAR(g10, expected_y, 4);
+ EXPECT_NEAR(r10, expected_y, 4);
+ EXPECT_EQ(a2, 3);
+ }
+
+ int count_b = 0;
+ int count_g = 0;
+ int count_r = 0;
+ for (int i = 0; i < kHistSize; ++i) {
+ if (histogram_b[i]) {
+ ++count_b;
+ }
+ if (histogram_g[i]) {
+ ++count_g;
+ }
+ if (histogram_r[i]) {
+ ++count_r;
+ }
+ }
+ printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);
+
+ free_aligned_buffer_page_end(orig_yuv);
+ free_aligned_buffer_page_end(ar30_pixels);
+}
+
+// Test RGB24 to ARGB and back to RGB24
+TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
+ const int kSize = 256;
+ align_buffer_page_end(orig_rgb24, kSize * 3);
+ align_buffer_page_end(argb_pixels, kSize * 4);
+ align_buffer_page_end(dest_rgb24, kSize * 3);
+
+ // Test grey scale
+ for (int i = 0; i < kSize * 3; ++i) {
+ orig_rgb24[i] = i;
+ }
+
+ RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
+ ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
+
+ for (int i = 0; i < kSize * 3; ++i) {
+ EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
+ }
+
+ free_aligned_buffer_page_end(orig_rgb24);
+ free_aligned_buffer_page_end(argb_pixels);
+ free_aligned_buffer_page_end(dest_rgb24);
+}
+
} // namespace libyuv
diff --git a/files/unit_test/cpu_test.cc b/files/unit_test/cpu_test.cc
index 048ed31..a7991d2 100644
--- a/files/unit_test/cpu_test.cc
+++ b/files/unit_test/cpu_test.cc
@@ -20,39 +20,56 @@
TEST_F(LibYUVBaseTest, TestCpuHas) {
int cpu_flags = TestCpuFlag(-1);
- printf("Cpu Flags %x\n", cpu_flags);
+ printf("Cpu Flags %d\n", cpu_flags);
+#if defined(__arm__) || defined(__aarch64__)
int has_arm = TestCpuFlag(kCpuHasARM);
- printf("Has ARM %x\n", has_arm);
+ printf("Has ARM %d\n", has_arm);
int has_neon = TestCpuFlag(kCpuHasNEON);
- printf("Has NEON %x\n", has_neon);
+ printf("Has NEON %d\n", has_neon);
+#endif
int has_x86 = TestCpuFlag(kCpuHasX86);
- printf("Has X86 %x\n", has_x86);
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
- printf("Has SSE2 %x\n", has_sse2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
- printf("Has SSSE3 %x\n", has_ssse3);
int has_sse41 = TestCpuFlag(kCpuHasSSE41);
- printf("Has SSE4.1 %x\n", has_sse41);
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
- printf("Has SSE4.2 %x\n", has_sse42);
int has_avx = TestCpuFlag(kCpuHasAVX);
- printf("Has AVX %x\n", has_avx);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- printf("Has AVX2 %x\n", has_avx2);
int has_erms = TestCpuFlag(kCpuHasERMS);
- printf("Has ERMS %x\n", has_erms);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- printf("Has FMA3 %x\n", has_fma3);
- int has_avx3 = TestCpuFlag(kCpuHasAVX3);
- printf("Has AVX3 %x\n", has_avx3);
int has_f16c = TestCpuFlag(kCpuHasF16C);
- printf("Has F16C %x\n", has_f16c);
+ int has_gfni = TestCpuFlag(kCpuHasGFNI);
+ int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+ int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+ int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+ int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+ int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
+ printf("Has X86 %d\n", has_x86);
+ printf("Has SSE2 %d\n", has_sse2);
+ printf("Has SSSE3 %d\n", has_ssse3);
+ printf("Has SSE41 %d\n", has_sse41);
+ printf("Has SSE42 %d\n", has_sse42);
+ printf("Has AVX %d\n", has_avx);
+ printf("Has AVX2 %d\n", has_avx2);
+ printf("Has ERMS %d\n", has_erms);
+ printf("Has FMA3 %d\n", has_fma3);
+ printf("Has F16C %d\n", has_f16c);
+ printf("Has GFNI %d\n", has_gfni);
+ printf("Has AVX512BW %d\n", has_avx512bw);
+ printf("Has AVX512VL %d\n", has_avx512vl);
+ printf("Has AVX512VBMI %d\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG %d\n", has_avx512vbitalg);
+ printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq);
+
+#if defined(__mips__)
int has_mips = TestCpuFlag(kCpuHasMIPS);
- printf("Has MIPS %x\n", has_mips);
- int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
- printf("Has DSPR2 %x\n", has_dspr2);
+ printf("Has MIPS %d\n", has_mips);
int has_msa = TestCpuFlag(kCpuHasMSA);
- printf("Has MSA %x\n", has_msa);
+ printf("Has MSA %d\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %d\n", has_mmi);
+#endif
}
TEST_F(LibYUVBaseTest, TestCpuCompilerEnabled) {
@@ -83,7 +100,7 @@
TEST_F(LibYUVBaseTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
- uint32 cpu_info[4];
+ int cpu_info[4];
// Vendor ID:
// AuthenticAMD AMD processor
// CentaurHauls Centaur processor
@@ -130,6 +147,8 @@
TEST_F(LibYUVBaseTest, TestLinuxNeon) {
if (FileExists("../../unit_test/testdata/arm_v7.txt")) {
+ printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n");
+
EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt"));
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt"));
@@ -141,4 +160,27 @@
#endif
}
+TEST_F(LibYUVBaseTest, TestSetCpuFlags) {
+ // Reset any masked flags that may have been set so auto init is enabled.
+ MaskCpuFlags(0);
+
+ int original_cpu_flags = TestCpuFlag(-1);
+
+ // Test setting different CPU configurations.
+ int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ cpu_flags = kCpuHasX86 | kCpuInitialized;
+ SetCpuFlags(cpu_flags);
+ EXPECT_EQ(cpu_flags, TestCpuFlag(-1));
+
+ // Test that setting 0 turns auto-init back on.
+ SetCpuFlags(0);
+ EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1));
+
+ // Restore the CPU flag mask.
+ MaskCpuFlags(benchmark_cpu_info_);
+}
+
} // namespace libyuv
diff --git a/files/unit_test/cpu_thread_test.cc b/files/unit_test/cpu_thread_test.cc
new file mode 100644
index 0000000..59061b9
--- /dev/null
+++ b/files/unit_test/cpu_thread_test.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include "libyuv/cpu_id.h"
+
+#if defined(__clang__)
+#if __has_include(<pthread.h>)
+#define LIBYUV_HAVE_PTHREAD 1
+#endif
+#elif defined(__linux__)
+#define LIBYUV_HAVE_PTHREAD 1
+#endif
+
+#ifdef LIBYUV_HAVE_PTHREAD
+#include <pthread.h>
+#endif
+
+namespace libyuv {
+
+#ifdef LIBYUV_HAVE_PTHREAD
+void* ThreadMain(void* arg) {
+ int* flags = static_cast<int*>(arg);
+
+ *flags = TestCpuFlag(kCpuHasSSSE3);
+ return nullptr;
+}
+#endif // LIBYUV_HAVE_PTHREAD
+
+// Call TestCpuFlag() from two threads. ThreadSanitizer should not report any
+// data race.
+TEST(LibYUVCpuThreadTest, TestCpuFlagMultipleThreads) {
+#ifdef LIBYUV_HAVE_PTHREAD
+ int cpu_flags1;
+ int cpu_flags2;
+ int ret;
+ pthread_t thread1;
+ pthread_t thread2;
+
+ MaskCpuFlags(0); // Reset to 0 to allow auto detect.
+ ret = pthread_create(&thread1, nullptr, ThreadMain, &cpu_flags1);
+ ASSERT_EQ(ret, 0);
+ ret = pthread_create(&thread2, nullptr, ThreadMain, &cpu_flags2);
+ ASSERT_EQ(ret, 0);
+ ret = pthread_join(thread1, nullptr);
+ EXPECT_EQ(ret, 0);
+ ret = pthread_join(thread2, nullptr);
+ EXPECT_EQ(ret, 0);
+ EXPECT_EQ(cpu_flags1, cpu_flags2);
+#else
+ printf("pthread unavailable; Test skipped.");
+#endif // LIBYUV_HAVE_PTHREAD
+}
+
+} // namespace libyuv
diff --git a/files/unit_test/math_test.cc b/files/unit_test/math_test.cc
index 2b4b57b..0abbad5 100644
--- a/files/unit_test/math_test.cc
+++ b/files/unit_test/math_test.cc
@@ -65,8 +65,8 @@
}
EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
- MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
- MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+ MemRandomize(reinterpret_cast<uint8_t*>(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast<uint8_t*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
if (div[j] == 0) {
div[j] = 1280;
@@ -90,8 +90,8 @@
int result_opt[1280];
int result_c[1280];
- MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
- MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+ MemRandomize(reinterpret_cast<uint8_t*>(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast<uint8_t*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
num[j] &= 4095; // Make numerator smaller.
div[j] &= 4095; // Make divisor smaller.
@@ -124,8 +124,8 @@
int result_opt[1280];
int result_c[1280];
- MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
- MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+ MemRandomize(reinterpret_cast<uint8_t*>(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast<uint8_t*>(&div[0]), sizeof(div));
for (int j = 0; j < 1280; ++j) {
num[j] &= 4095; // Make numerator smaller.
div[j] &= 4095; // Make divisor smaller.
diff --git a/files/unit_test/planar_test.cc b/files/unit_test/planar_test.cc
index 28d557a..70f8966 100644
--- a/files/unit_test/planar_test.cc
+++ b/files/unit_test/planar_test.cc
@@ -8,9 +8,13 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include <stdlib.h>
#include <time.h>
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@@ -248,8 +252,8 @@
}
TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
- SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
- SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[16][16][4]);
+ SIMD_ALIGNED(int32_t added_pixels[16][16][4]);
for (int y = 0; y < 16; ++y) {
for (int x = 0; x < 16; ++x) {
@@ -274,7 +278,7 @@
}
TEST_F(LibYUVPlanarTest, TestARGBGray) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Test blue
@@ -345,8 +349,8 @@
}
TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 gray_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t gray_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Test blue
@@ -417,7 +421,7 @@
}
TEST_F(LibYUVPlanarTest, TestARGBSepia) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Test blue
@@ -489,12 +493,12 @@
}
TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]);
// Matrix for Sepia.
- SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+ SIMD_ALIGNED(static const int8_t kRGBToSepia[]) = {
17 / 2, 68 / 2, 35 / 2, 0, 22 / 2, 88 / 2, 45 / 2, 0,
24 / 2, 98 / 2, 50 / 2, 0, 0, 0, 0, 64, // Copy alpha.
};
@@ -565,10 +569,10 @@
}
TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
// Matrix for Sepia.
- SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+ SIMD_ALIGNED(static const int8_t kRGBToSepia[]) = {
17, 68, 35, 0, 22, 88, 45, 0,
24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes.
};
@@ -625,11 +629,11 @@
}
TEST_F(LibYUVPlanarTest, TestARGBColorTable) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Matrix for Sepia.
- static const uint8 kARGBTable[256 * 4] = {
+ static const uint8_t kARGBTable[256 * 4] = {
1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
};
@@ -681,11 +685,11 @@
// Same as TestARGBColorTable except alpha does not change.
TEST_F(LibYUVPlanarTest, TestRGBColorTable) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Matrix for Sepia.
- static const uint8 kARGBTable[256 * 4] = {
+ static const uint8_t kARGBTable[256 * 4] = {
1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u,
};
@@ -736,7 +740,7 @@
}
TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -760,8 +764,8 @@
}
TEST_F(LibYUVPlanarTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
@@ -783,8 +787,8 @@
}
TEST_F(LibYUVPlanarTest, TestShade) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 shade_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
orig_pixels[0][0] = 10u;
@@ -841,9 +845,9 @@
}
TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
- SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
- SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]);
- SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels_1[1280][4]);
+ SIMD_ALIGNED(uint8_t interpolate_pixels[1280][4]);
memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
@@ -922,9 +926,9 @@
}
TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
- SIMD_ALIGNED(uint8 orig_pixels_0[1280]);
- SIMD_ALIGNED(uint8 orig_pixels_1[1280]);
- SIMD_ALIGNED(uint8 interpolate_pixels[1280]);
+ SIMD_ALIGNED(uint8_t orig_pixels_0[1280]);
+ SIMD_ALIGNED(uint8_t orig_pixels_1[1280]);
+ SIMD_ALIGNED(uint8_t interpolate_pixels[1280]);
memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
@@ -1188,7 +1192,6 @@
free_aligned_buffer_page_end(src_argb_alpha);
free_aligned_buffer_page_end(dst_argb_c);
free_aligned_buffer_page_end(dst_argb_opt);
- return;
}
TEST_F(LibYUVPlanarTest, BlendPlane_Opt) {
@@ -1282,7 +1285,6 @@
free_aligned_buffer_page_end(dst_y_opt);
free_aligned_buffer_page_end(dst_u_opt);
free_aligned_buffer_page_end(dst_v_opt);
- return;
}
TEST_F(LibYUVPlanarTest, I420Blend_Opt) {
@@ -1305,8 +1307,8 @@
}
TEST_F(LibYUVPlanarTest, TestAffine) {
- SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
- SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8_t interpolate_pixels_C[1280][4]);
for (int i = 0; i < 1280; ++i) {
for (int j = 0; j < 4; ++j) {
@@ -1323,7 +1325,7 @@
EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
#if defined(HAS_ARGBAFFINEROW_SSE2)
- SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]);
+ SIMD_ALIGNED(uint8_t interpolate_pixels_Opt[1280][4]);
ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
uv_step, 1280);
EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
@@ -1363,7 +1365,7 @@
// Fill destination buffers with random data.
for (i = 0; i < y_plane_size; ++i) {
- uint8 random_number = fastrand() & 0x7f;
+ uint8_t random_number = fastrand() & 0x7f;
dst_c[i] = random_number;
dst_opt[i] = dst_c[i];
}
@@ -1386,8 +1388,9 @@
}
for (i = 0; i < y_plane_size; ++i) {
- if (dst_c[i] != dst_opt[i])
+ if (dst_c[i] != dst_opt[i]) {
++err;
+ }
}
free_aligned_buffer_page_end(orig_y);
@@ -1863,12 +1866,12 @@
MaskCpuFlags(disable_cpu_flags);
ARGBBlur(src_argb_a + off, kStride, dst_argb_c, kStride,
- reinterpret_cast<int32*>(dst_cumsum), width * 4, width,
+ reinterpret_cast<int32_t*>(dst_cumsum), width * 4, width,
invert * height, radius);
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; ++i) {
ARGBBlur(src_argb_a + off, kStride, dst_argb_opt, kStride,
- reinterpret_cast<int32*>(dst_cumsum), width * 4, width,
+ reinterpret_cast<int32_t*>(dst_cumsum), width * 4, width,
invert * height, radius);
}
int max_diff = 0;
@@ -1945,9 +1948,9 @@
}
TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
@@ -2042,37 +2045,38 @@
const int y_plane_size = benchmark_width * benchmark_height * 2;
align_buffer_page_end(orig_y, y_plane_size * 3);
- uint8* dst_opt = orig_y + y_plane_size;
- uint8* dst_c = orig_y + y_plane_size * 2;
+ uint8_t* dst_opt = orig_y + y_plane_size;
+ uint8_t* dst_c = orig_y + y_plane_size * 2;
MemRandomize(orig_y, y_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
for (i = 0; i < y_plane_size / 2; ++i) {
- reinterpret_cast<uint16*>(orig_y)[i] &= mask;
+ reinterpret_cast<uint16_t*>(orig_y)[i] &= mask;
}
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (j = 0; j < benchmark_iterations; j++) {
- HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
- reinterpret_cast<uint16*>(dst_c), benchmark_width * 2, scale,
- benchmark_width, benchmark_height);
+ HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
+ reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
+ scale, benchmark_width, benchmark_height);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (j = 0; j < benchmark_iterations; j++) {
- HalfFloatPlane(reinterpret_cast<uint16*>(orig_y), benchmark_width * 2,
- reinterpret_cast<uint16*>(dst_opt), benchmark_width * 2,
+ HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
+ reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
}
int max_diff = 0;
for (i = 0; i < y_plane_size / 2; ++i) {
- int abs_diff = abs(static_cast<int>(reinterpret_cast<uint16*>(dst_c)[i]) -
- static_cast<int>(reinterpret_cast<uint16*>(dst_opt)[i]));
+ int abs_diff =
+ abs(static_cast<int>(reinterpret_cast<uint16_t*>(dst_c)[i]) -
+ static_cast<int>(reinterpret_cast<uint16_t*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@@ -2164,10 +2168,56 @@
EXPECT_LE(diff, 1);
}
+float TestByteToFloat(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info,
+ float scale) {
+ int i, j;
+ const int y_plane_size = benchmark_width * benchmark_height;
+
+ align_buffer_page_end(orig_y, y_plane_size * (1 + 4 + 4));
+ float* dst_opt = reinterpret_cast<float*>(orig_y + y_plane_size);
+ float* dst_c = reinterpret_cast<float*>(orig_y + y_plane_size * 5);
+
+ MemRandomize(orig_y, y_plane_size);
+ memset(dst_c, 0, y_plane_size * 4);
+ memset(dst_opt, 1, y_plane_size * 4);
+
+ // Disable all optimizations.
+ MaskCpuFlags(disable_cpu_flags);
+ ByteToFloat(orig_y, dst_c, scale, y_plane_size);
+
+ // Enable optimizations.
+ MaskCpuFlags(benchmark_cpu_info);
+ for (j = 0; j < benchmark_iterations; j++) {
+ ByteToFloat(orig_y, dst_opt, scale, y_plane_size);
+ }
+
+ float max_diff = 0;
+ for (i = 0; i < y_plane_size; ++i) {
+ float abs_diff = fabs(dst_c[i] - dst_opt[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestByteToFloat) {
+ float diff = TestByteToFloat(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_, 1.0f);
+ EXPECT_EQ(0.f, diff);
+}
+
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
- SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
- SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
align_buffer_page_end(lumacolortable, 32768);
@@ -2339,7 +2389,7 @@
}
const int kStride = width * bpp;
const int kSize = kStride * height;
- const uint32 v32 = fastrand() & (bpp == 4 ? 0xffffffff : 0xff);
+ const uint32_t v32 = fastrand() & (bpp == 4 ? 0xffffffff : 0xff);
align_buffer_page_end(dst_argb_c, kSize + off);
align_buffer_page_end(dst_argb_opt, kSize + off);
@@ -2518,4 +2568,805 @@
free_aligned_buffer_page_end(dst_pixels_c);
}
+TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 3);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 3);
+ align_buffer_page_end(dst_pixels_c, kPixels * 3);
+
+ MemRandomize(src_pixels, kPixels * 3);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 3);
+ MemRandomize(dst_pixels_c, kPixels * 3);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, dst_pixels_c,
+ benchmark_width_ * 3, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
+ benchmark_width_, tmp_pixels_b, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 3, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 3; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels, kPixels * 3);
+ align_buffer_page_end(tmp_pixels_r, kPixels);
+ align_buffer_page_end(tmp_pixels_g, kPixels);
+ align_buffer_page_end(tmp_pixels_b, kPixels);
+ align_buffer_page_end(dst_pixels_opt, kPixels * 3);
+ align_buffer_page_end(dst_pixels_c, kPixels * 3);
+
+ MemRandomize(src_pixels, kPixels * 3);
+ MemRandomize(tmp_pixels_r, kPixels);
+ MemRandomize(tmp_pixels_g, kPixels);
+ MemRandomize(tmp_pixels_b, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 3);
+ MemRandomize(dst_pixels_c, kPixels * 3);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
+ benchmark_width_, benchmark_width_, benchmark_height_);
+ MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, dst_pixels_c,
+ benchmark_width_ * 3, benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r,
+ benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, benchmark_width_,
+ benchmark_height_);
+ }
+ MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
+ tmp_pixels_b, benchmark_width_, dst_pixels_opt,
+ benchmark_width_ * 3, benchmark_width_, benchmark_height_);
+
+ for (int i = 0; i < kPixels * 3; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels);
+ free_aligned_buffer_page_end(tmp_pixels_r);
+ free_aligned_buffer_page_end(tmp_pixels_g);
+ free_aligned_buffer_page_end(tmp_pixels_b);
+ free_aligned_buffer_page_end(dst_pixels_opt);
+ free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+// TODO(fbarchard): improve test for platforms and cpu detect
+#ifdef HAS_MERGEUVROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_u, kPixels * 2);
+ align_buffer_page_end(src_pixels_v, kPixels * 2);
+ align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
+ align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2);
+
+ MemRandomize(src_pixels_u, kPixels * 2);
+ MemRandomize(src_pixels_v, kPixels * 2);
+ memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2);
+ memset(dst_pixels_uv_c, 1, kPixels * 2 * 2);
+
+ MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
+ reinterpret_cast<const uint16_t*>(src_pixels_v),
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_c), 64, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ MergeUVRow_16_AVX2(reinterpret_cast<const uint16_t*>(src_pixels_u),
+ reinterpret_cast<const uint16_t*>(src_pixels_v),
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ kPixels);
+ } else {
+ MergeUVRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_u),
+ reinterpret_cast<const uint16_t*>(src_pixels_v),
+ reinterpret_cast<uint16_t*>(dst_pixels_uv_opt), 64,
+ kPixels);
+ }
+ }
+
+ for (int i = 0; i < kPixels * 2 * 2; ++i) {
+ EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_u);
+ free_aligned_buffer_page_end(src_pixels_v);
+ free_aligned_buffer_page_end(dst_pixels_uv_opt);
+ free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+#endif
+
+// TODO(fbarchard): Improve test for more platforms.
+#ifdef HAS_MULTIPLYROW_16_AVX2
+TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels * 2);
+ memset(dst_pixels_y_c, 1, kPixels * 2);
+
+ MultiplyRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ reinterpret_cast<uint16_t*>(dst_pixels_y_c), 64, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ MultiplyRow_16_AVX2(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt), 64,
+ kPixels);
+ } else {
+ MultiplyRow_16_C(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt), 64,
+ kPixels);
+ }
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif // HAS_MULTIPLYROW_16_AVX2
+
+TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
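+  // With scale 16384, (v * 16384) >> 16 == v >> 2, i.e. 10-bit input is
+  // mapped down to 8 bits.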
+ MaskCpuFlags(disable_cpu_flags_);
+ Convert16To8Plane(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ benchmark_width_, dst_pixels_y_c, benchmark_width_, 16384,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ Convert16To8Plane(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ benchmark_width_, dst_pixels_y_opt, benchmark_width_,
+ 16384, benchmark_width_, benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+// TODO(fbarchard): Improve test for more platforms.
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
+ // AVX2 does multiple of 32, so round count up
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+ align_buffer_page_end(src_pixels_y, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels);
+ align_buffer_page_end(dst_pixels_y_c, kPixels);
+
+ MemRandomize(src_pixels_y, kPixels * 2);
+ // clamp source range to 10 bits.
+ for (int i = 0; i < kPixels; ++i) {
+ reinterpret_cast<uint16_t*>(src_pixels_y)[i] &= 1023;
+ }
+
+ memset(dst_pixels_y_opt, 0, kPixels);
+ memset(dst_pixels_y_c, 1, kPixels);
+
+ Convert16To8Row_C(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ dst_pixels_y_c, 16384, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ Convert16To8Row_AVX2(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ dst_pixels_y_opt, 16384, kPixels);
+ } else if (has_ssse3) {
+ Convert16To8Row_SSSE3(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ dst_pixels_y_opt, 16384, kPixels);
+ } else {
+ Convert16To8Row_C(reinterpret_cast<const uint16_t*>(src_pixels_y),
+ dst_pixels_y_opt, 16384, kPixels);
+ }
+ }
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif // HAS_CONVERT16TO8ROW_AVX2
+
+TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_y, kPixels);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+ MemRandomize(src_pixels_y, kPixels);
+ memset(dst_pixels_y_opt, 0, kPixels * 2);
+ memset(dst_pixels_y_c, 1, kPixels * 2);
+
+ MaskCpuFlags(disable_cpu_flags_);
+ Convert8To16Plane(src_pixels_y, benchmark_width_,
+ reinterpret_cast<uint16_t*>(dst_pixels_y_c),
+ benchmark_width_, 1024, benchmark_width_,
+ benchmark_height_);
+ MaskCpuFlags(benchmark_cpu_info_);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ Convert8To16Plane(src_pixels_y, benchmark_width_,
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt),
+ benchmark_width_, 1024, benchmark_width_,
+ benchmark_height_);
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+
+// TODO(fbarchard): Improve test for more platforms.
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) {
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
+ align_buffer_page_end(src_pixels_y, kPixels);
+ align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
+ align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
+
+ MemRandomize(src_pixels_y, kPixels);
+ memset(dst_pixels_y_opt, 0, kPixels * 2);
+ memset(dst_pixels_y_c, 1, kPixels * 2);
+
+ Convert8To16Row_C(src_pixels_y, reinterpret_cast<uint16_t*>(dst_pixels_y_c),
+ 1024, kPixels);
+
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ if (has_avx2) {
+ Convert8To16Row_AVX2(src_pixels_y,
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt), 1024,
+ kPixels);
+ } else if (has_sse2) {
+ Convert8To16Row_SSE2(src_pixels_y,
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt), 1024,
+ kPixels);
+ } else {
+ Convert8To16Row_C(src_pixels_y,
+ reinterpret_cast<uint16_t*>(dst_pixels_y_opt), 1024,
+ kPixels);
+ }
+ }
+
+ for (int i = 0; i < kPixels * 2; ++i) {
+ EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_y);
+ free_aligned_buffer_page_end(dst_pixels_y_opt);
+ free_aligned_buffer_page_end(dst_pixels_y_c);
+}
+#endif // HAS_CONVERT8TO16ROW_AVX2
+
+float TestScaleMaxSamples(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ float scale,
+ bool opt) {
+ int i, j;
+ float max_c, max_opt = 0.f;
+ // NEON does multiple of 8, so round count up
+ const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+ align_buffer_page_end(orig_y, kPixels * 4 * 3 + 48);
+ uint8_t* dst_c = orig_y + kPixels * 4 + 16;
+ uint8_t* dst_opt = orig_y + kPixels * 4 * 2 + 32;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // Large values are problematic; audio samples are really in [-1, 1].
+ for (i = 0; i < kPixels; ++i) {
+ (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
+ }
+ memset(dst_c, 0, kPixels * 4);
+ memset(dst_opt, 1, kPixels * 4);
+
+ max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_c), scale, kPixels);
+
+ for (j = 0; j < benchmark_iterations; j++) {
+ if (opt) {
+#ifdef HAS_SCALESUMSAMPLES_NEON
+ max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale,
+ kPixels);
+#else
+ max_opt =
+ ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+#endif
+ } else {
+ max_opt =
+ ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+ }
+ }
+
+ float max_diff = FAbs(max_opt - max_c);
+ for (i = 0; i < kPixels; ++i) {
+ float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+ (reinterpret_cast<float*>(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_C) {
+ float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, false);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_Opt) {
+ float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, true);
+ EXPECT_EQ(0, diff);
+}
+
+float TestScaleSumSamples(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ float scale,
+ bool opt) {
+ int i, j;
+ float sum_c, sum_opt = 0.f;
+ // NEON does multiple of 8, so round count up
+ const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+ align_buffer_page_end(orig_y, kPixels * 4 * 3);
+ uint8_t* dst_c = orig_y + kPixels * 4;
+ uint8_t* dst_opt = orig_y + kPixels * 4 * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // Large values are problematic; audio samples are really in [-1, 1].
+ for (i = 0; i < kPixels; ++i) {
+ (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
+ }
+ memset(dst_c, 0, kPixels * 4);
+ memset(dst_opt, 1, kPixels * 4);
+
+ sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_c), scale, kPixels);
+
+ for (j = 0; j < benchmark_iterations; j++) {
+ if (opt) {
+#ifdef HAS_SCALESUMSAMPLES_NEON
+ sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale,
+ kPixels);
+#else
+ sum_opt =
+ ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+#endif
+ } else {
+ sum_opt =
+ ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+ }
+ }
+
+ float mse_opt = sum_opt / kPixels * 4;
+ float mse_c = sum_c / kPixels * 4;
+ float mse_error = FAbs(mse_opt - mse_c) / mse_c;
+
+  // If the sum exceeds about 4 million, small addends are rounded away in
+  // float, so a vectorized sum can produce different results than a scalar
+  // sum. Ignore the difference when the sum is that large.
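+  // Worked example: floats carry a 24-bit significand, so near 4194304.0f
+  // (2^22) the spacing is 0.5 and 4194304.0f + 0.2f == 4194304.0f. Vector
+  // lanes each hold roughly 1/4 of the total, still absorb such small terms,
+  // and can therefore disagree with a scalar running sum.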
+ float max_diff = 0.f;
+ if (mse_error > 0.0001 && sum_c < 4000000) { // allow .01% difference of mse
+ max_diff = mse_error;
+ }
+
+ for (i = 0; i < kPixels; ++i) {
+ float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+ (reinterpret_cast<float*>(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
+ float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, false);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
+ float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, true);
+ EXPECT_EQ(0, diff);
+}
+
+float TestScaleSamples(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ float scale,
+ bool opt) {
+ int i, j;
+  // NEON processes multiples of 8, so round the count up.
+ const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+ align_buffer_page_end(orig_y, kPixels * 4 * 3);
+ uint8_t* dst_c = orig_y + kPixels * 4;
+ uint8_t* dst_opt = orig_y + kPixels * 4 * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // Large values are problematic; audio is really -1 to 1.
+ for (i = 0; i < kPixels; ++i) {
+ (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
+ }
+ memset(dst_c, 0, kPixels * 4);
+ memset(dst_opt, 1, kPixels * 4);
+
+ ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_c), scale, kPixels);
+
+ for (j = 0; j < benchmark_iterations; j++) {
+ if (opt) {
+#ifdef HAS_SCALESUMSAMPLES_NEON
+ ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+#else
+ ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+#endif
+ } else {
+ ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+ reinterpret_cast<float*>(dst_opt), scale, kPixels);
+ }
+ }
+
+ float max_diff = 0.f;
+ for (i = 0; i < kPixels; ++i) {
+ float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+ (reinterpret_cast<float*>(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
+ float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, false);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
+ float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, true);
+ EXPECT_EQ(0, diff);
+}
+
+float TestCopySamples(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ bool opt) {
+ int i, j;
+  // NEON processes multiples of 16 floats, so round the count up.
+ const int kPixels = (benchmark_width * benchmark_height + 15) & ~15;
+ align_buffer_page_end(orig_y, kPixels * 4 * 3);
+ uint8_t* dst_c = orig_y + kPixels * 4;
+ uint8_t* dst_opt = orig_y + kPixels * 4 * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // Large values are problematic; audio is really -1 to 1.
+ for (i = 0; i < kPixels; ++i) {
+ (reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
+ }
+ memset(dst_c, 0, kPixels * 4);
+ memset(dst_opt, 1, kPixels * 4);
+
+ memcpy(reinterpret_cast<void*>(dst_c), reinterpret_cast<void*>(orig_y),
+ kPixels * 4);
+
+ for (j = 0; j < benchmark_iterations; j++) {
+ if (opt) {
+#ifdef HAS_COPYROW_NEON
+ CopyRow_NEON(orig_y, dst_opt, kPixels * 4);
+#else
+ CopyRow_C(orig_y, dst_opt, kPixels * 4);
+#endif
+ } else {
+ CopyRow_C(orig_y, dst_opt, kPixels * 4);
+ }
+ }
+
+ float max_diff = 0.f;
+ for (i = 0; i < kPixels; ++i) {
+ float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+ (reinterpret_cast<float*>(dst_opt)[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(orig_y);
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
+ float diff = TestCopySamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, false);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
+ float diff = TestCopySamples(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, true);
+ EXPECT_EQ(0, diff);
+}
+
+extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
+extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
+
+TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
+ SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 640 + 4; ++i) {
+ orig_pixels[i] = i * 256;
+ }
+ GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
+ for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ } else {
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+ }
+#else
+ GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+#endif
+ }
+
+ for (int i = 0; i < 640; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
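+  // 1, 4, 6, 4, 1 is the 5-tap binomial (Gaussian) kernel. The expectations
+  // imply a /256 normalization in the row pass: with inputs of i * 256 the
+  // result is the plain weighted index sum, e.g.
+  // dst[639] = 639 * 1 + 640 * 4 + 641 * 6 + 642 * 4 + 643 * 1 = 10256.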
+ EXPECT_EQ(dst_pixels_c[0],
+ static_cast<uint16_t>(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1));
+ EXPECT_EQ(dst_pixels_c[639], static_cast<uint16_t>(10256));
+}
+
+extern "C" void GaussCol_NEON(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width);
+
+extern "C" void GaussCol_C(const uint16_t* src0,
+ const uint16_t* src1,
+ const uint16_t* src2,
+ const uint16_t* src3,
+ const uint16_t* src4,
+ uint32_t* dst,
+ int width);
+
+TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
+ SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
+ SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
+ SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 640 * 5; ++i) {
+ orig_pixels[i] = i;
+ }
+ GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
+ &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
+ 640);
+ for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
+ &orig_pixels[640 * 3], &orig_pixels[640 * 4],
+ &dst_pixels_opt[0], 640);
+ } else {
+ GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
+ &orig_pixels[640 * 3], &orig_pixels[640 * 4],
+ &dst_pixels_opt[0], 640);
+ }
+#else
+ GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
+ &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
+ 640);
+#endif
+ }
+
+ for (int i = 0; i < 640; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
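+  // The column pass applies the same 1, 4, 6, 4, 1 kernel, but unnormalized:
+  // dst[639] = 639 * 1 + 1279 * 4 + 1919 * 6 + 2559 * 4 + 3199 * 1 = 30704.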
+ EXPECT_EQ(dst_pixels_c[0],
+ static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
+ 640 * 4 * 1));
+ EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
+}
+
+float TestFloatDivToByte(int benchmark_width,
+ int benchmark_height,
+ int benchmark_iterations,
+ float scale,
+ bool opt) {
+ int i, j;
+  // NEON processes multiples of 8, so round the count up.
+ const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+ align_buffer_page_end(src_weights, kPixels * 4);
+ align_buffer_page_end(src_values, kPixels * 4);
+ align_buffer_page_end(dst_out_c, kPixels);
+ align_buffer_page_end(dst_out_opt, kPixels);
+ align_buffer_page_end(dst_mask_c, kPixels);
+ align_buffer_page_end(dst_mask_opt, kPixels);
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(src_values, kPixels * 4);
+  // Large values are problematic; audio is really -1 to 1.
+ for (i = 0; i < kPixels; ++i) {
+ (reinterpret_cast<float*>(src_weights))[i] = scale;
+ (reinterpret_cast<float*>(src_values))[i] =
+ sinf(static_cast<float>(i) * 0.1f);
+ }
+ memset(dst_out_c, 0, kPixels);
+ memset(dst_out_opt, 1, kPixels);
+ memset(dst_mask_c, 2, kPixels);
+ memset(dst_mask_opt, 3, kPixels);
+
+ FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+ reinterpret_cast<float*>(src_values), dst_out_c,
+ dst_mask_c, kPixels);
+
+ for (j = 0; j < benchmark_iterations; j++) {
+ if (opt) {
+#ifdef HAS_FLOATDIVTOBYTEROW_NEON
+ FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
+ reinterpret_cast<float*>(src_values), dst_out_opt,
+ dst_mask_opt, kPixels);
+#else
+ FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+ reinterpret_cast<float*>(src_values), dst_out_opt,
+ dst_mask_opt, kPixels);
+#endif
+ } else {
+ FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+ reinterpret_cast<float*>(src_values), dst_out_opt,
+ dst_mask_opt, kPixels);
+ }
+ }
+
+ uint8_t max_diff = 0;
+ for (i = 0; i < kPixels; ++i) {
+ uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
+ abs(dst_mask_c[i] - dst_mask_opt[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(src_weights);
+ free_aligned_buffer_page_end(src_values);
+ free_aligned_buffer_page_end(dst_out_c);
+ free_aligned_buffer_page_end(dst_out_opt);
+ free_aligned_buffer_page_end(dst_mask_c);
+ free_aligned_buffer_page_end(dst_mask_opt);
+
+ return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
+ float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, false);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
+ float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, 1.2f, true);
+ EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, UVToVURow) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_page_end(src_pixels_vu, kPixels * 2);
+ align_buffer_page_end(dst_pixels_uv, kPixels * 2);
+
+ MemRandomize(src_pixels_vu, kPixels * 2);
+ memset(dst_pixels_uv, 1, kPixels * 2);
+
+ UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels);
+
+ for (int i = 0; i < kPixels; ++i) {
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
+ EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
+ }
+
+ free_aligned_buffer_page_end(src_pixels_vu);
+ free_aligned_buffer_page_end(dst_pixels_uv);
+}
+
} // namespace libyuv
diff --git a/files/unit_test/rotate_test.cc b/files/unit_test/rotate_test.cc
index d04b96e..61941e6 100644
--- a/files/unit_test/rotate_test.cc
+++ b/files/unit_test/rotate_test.cc
@@ -135,6 +135,123 @@
benchmark_cpu_info_);
}
+static void I444TestRotate(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height == 0) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i444_y_size = src_width * Abs(src_height);
+ int src_i444_uv_size = src_width * Abs(src_height);
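+  // I444 is 4:4:4: chroma planes are full resolution, so UV size equals Y.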
+ int src_i444_size = src_i444_y_size + src_i444_uv_size * 2;
+ align_buffer_page_end(src_i444, src_i444_size);
+ for (int i = 0; i < src_i444_size; ++i) {
+ src_i444[i] = fastrand() & 0xff;
+ }
+
+ int dst_i444_y_size = dst_width * dst_height;
+ int dst_i444_uv_size = dst_width * dst_height;
+ int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2;
+ align_buffer_page_end(dst_i444_c, dst_i444_size);
+ align_buffer_page_end(dst_i444_opt, dst_i444_size);
+ memset(dst_i444_c, 2, dst_i444_size);
+ memset(dst_i444_opt, 3, dst_i444_size);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width,
+ dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width,
+ src_i444 + src_i444_y_size + src_i444_uv_size, src_width,
+ dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size,
+ dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size,
+ dst_width, src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i444_size; ++i) {
+ EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]);
+ }
+
+ free_aligned_buffer_page_end(dst_i444_c);
+ free_aligned_buffer_page_end(dst_i444_opt);
+ free_aligned_buffer_page_end(src_i444);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate0_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate0, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate90_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate90, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate180_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_,
+ benchmark_height_, kRotate180, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, I444Rotate270_Opt) {
+ I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_,
+ benchmark_width_, kRotate270, benchmark_iterations_,
+ disable_cpu_flags_, benchmark_cpu_info_);
+}
+
+// TODO(fbarchard): Remove odd width tests.
+// Odd width tests work but are disabled because they use C code; they can be
+// exercised by passing an odd width on the command line or via an
+// environment variable.
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate0,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate90,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1, kRotate180,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) {
+ I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3, kRotate270,
+ benchmark_iterations_, disable_cpu_flags_,
+ benchmark_cpu_info_);
+}
+
static void NV12TestRotate(int src_width,
int src_height,
int dst_width,
diff --git a/files/unit_test/scale_argb_test.cc b/files/unit_test/scale_argb_test.cc
index d11aec2..94aef60 100644
--- a/files/unit_test/scale_argb_test.cc
+++ b/files/unit_test/scale_argb_test.cc
@@ -37,7 +37,7 @@
int i, j;
const int b = 0; // 128 to test for padding/stride.
- int64 src_argb_plane_size =
+ int64_t src_argb_plane_size =
(Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
@@ -48,7 +48,8 @@
}
MemRandomize(src_argb, src_argb_plane_size);
- int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
+ int64_t dst_argb_plane_size =
+ (dst_width + b * 2) * (dst_height + b * 2) * 4LL;
int dst_stride_argb = (b * 2 + dst_width) * 4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
@@ -116,11 +117,11 @@
static const int kTileX = 8;
static const int kTileY = 8;
-static int TileARGBScale(const uint8* src_argb,
+static int TileARGBScale(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -157,7 +158,7 @@
}
const int b = 128;
- int64 src_argb_plane_size =
+ int64_t src_argb_plane_size =
(Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4;
int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
@@ -168,7 +169,7 @@
}
memset(src_argb, 1, src_argb_plane_size);
- int64 dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int64_t dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
int dst_stride_argb = (b * 2 + dst_width) * 4;
int i, j;
@@ -302,27 +303,28 @@
TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 320, 240)
-TEST_SCALETO(ARGBScale, 352, 288)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
// Scale with YUV conversion to ARGB and clipping.
+// TODO(fbarchard): Add fourcc support. All 4 ARGB formats are easy to support.
LIBYUV_API
-int YUVToARGBScaleReference2(const uint8* src_y,
+int YUVToARGBScaleReference2(const uint8_t* src_y,
int src_stride_y,
- const uint8* src_u,
+ const uint8_t* src_u,
int src_stride_u,
- const uint8* src_v,
+ const uint8_t* src_v,
int src_stride_v,
- uint32 /* src_fourcc */, // TODO: Add support.
+ uint32_t /* src_fourcc */,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
- uint32 /* dst_fourcc */, // TODO: Add support.
+ uint32_t /* dst_fourcc */,
int dst_width,
int dst_height,
int clip_x,
@@ -330,7 +332,8 @@
int clip_width,
int clip_height,
enum FilterMode filtering) {
- uint8* argb_buffer = static_cast<uint8*>(malloc(src_width * src_height * 4));
+ uint8_t* argb_buffer =
+ static_cast<uint8_t*>(malloc(src_width * src_height * 4));
int r;
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
@@ -342,7 +345,12 @@
return r;
}
-static void FillRamp(uint8* buf, int width, int height, int v, int dx, int dy) {
+static void FillRamp(uint8_t* buf,
+ int width,
+ int height,
+ int v,
+ int dx,
+ int dy) {
int rv = v;
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
@@ -369,8 +377,8 @@
int dst_height,
FilterMode f,
int benchmark_iterations) {
- int64 src_y_plane_size = Abs(src_width) * Abs(src_height);
- int64 src_uv_plane_size =
+ int64_t src_y_plane_size = Abs(src_width) * Abs(src_height);
+ int64_t src_uv_plane_size =
((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2);
int src_stride_y = Abs(src_width);
int src_stride_uv = (Abs(src_width) + 1) / 2;
@@ -379,7 +387,7 @@
align_buffer_page_end(src_u, src_uv_plane_size);
align_buffer_page_end(src_v, src_uv_plane_size);
- int64 dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
+ int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL;
int dst_stride_argb = (dst_width)*4;
align_buffer_page_end(dst_argb_c, dst_argb_plane_size);
align_buffer_page_end(dst_argb_opt, dst_argb_plane_size);
diff --git a/files/unit_test/scale_test.cc b/files/unit_test/scale_test.cc
index 0b4ec30..811b2d0 100644
--- a/files/unit_test/scale_test.cc
+++ b/files/unit_test/scale_test.cc
@@ -14,6 +14,7 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
+#include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C
#define STRINGIZE(line) #line
#define FILELINESTR(file, line) file ":" STRINGIZE(line)
@@ -21,32 +22,32 @@
namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
-static int TestFilter(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations,
- int disable_cpu_flags,
- int benchmark_cpu_info) {
+static int I420TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
int i, j;
- const int b = 0; // 128 to test for padding/stride.
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
- int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
- int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
- int src_stride_y = b * 2 + Abs(src_width);
- int src_stride_uv = b * 2 + src_width_uv;
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
- align_buffer_page_end(src_y, src_y_plane_size)
- align_buffer_page_end(src_u, src_uv_plane_size) align_buffer_page_end(
- src_v, src_uv_plane_size) if (!src_y || !src_u || !src_v) {
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ if (!src_y || !src_u || !src_v) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
@@ -57,60 +58,51 @@
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
- int64 dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
- int64 dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
- int dst_stride_y = b * 2 + dst_width;
- int dst_stride_uv = b * 2 + dst_width_uv;
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
- align_buffer_page_end(dst_y_c, dst_y_plane_size)
- align_buffer_page_end(dst_u_c, dst_uv_plane_size)
- align_buffer_page_end(dst_v_c, dst_uv_plane_size)
- align_buffer_page_end(dst_y_opt, dst_y_plane_size)
- align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
- align_buffer_page_end(
- dst_v_opt,
- dst_uv_plane_size) if (!dst_y_c || !dst_u_c ||
- !dst_v_c || !dst_y_opt ||
- !dst_u_opt || !dst_v_opt) {
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt ||
+ !dst_v_opt) {
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
return 0;
}
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
double c_time = get_time();
- I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
- src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
- src_height, dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
- dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
- dst_height, f);
+ I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_c, dst_stride_y, dst_u_c,
+ dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
- I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
- src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
- src_height, dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
- dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
- dst_height, f);
+ I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt,
+ dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height,
+ f);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
- // Report performance of C vs OPT
+ // Report performance of C vs OPT.
printf("filter %d - %8d us C - %8d us OPT\n", f,
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
- // of the buffers and look to see that the max difference isn't
- // over 2.
+ // of the buffers and look to see that the max difference is not
+ // over 3.
int max_diff = 0;
- for (i = b; i < (dst_height + b); ++i) {
- for (j = b; j < (dst_width + b); ++j) {
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
dst_y_opt[(i * dst_stride_y) + j]);
if (abs_diff > max_diff) {
@@ -119,8 +111,8 @@
}
}
- for (i = b; i < (dst_height_uv + b); ++i) {
- for (j = b; j < (dst_width_uv + b); ++j) {
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv); ++j) {
int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
dst_u_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
@@ -134,170 +126,408 @@
}
}
- free_aligned_buffer_page_end(dst_y_c) free_aligned_buffer_page_end(dst_u_c)
- free_aligned_buffer_page_end(dst_v_c)
- free_aligned_buffer_page_end(dst_y_opt)
- free_aligned_buffer_page_end(dst_u_opt)
- free_aligned_buffer_page_end(dst_v_opt)
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
- free_aligned_buffer_page_end(src_y)
- free_aligned_buffer_page_end(src_u)
- free_aligned_buffer_page_end(src_v)
-
- return max_diff;
+ return max_diff;
}
// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
// 0 = exact.
-static int TestFilter_16(int src_width,
- int src_height,
- int dst_width,
- int dst_height,
- FilterMode f,
- int benchmark_iterations) {
+static int I420TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
- int i, j;
- const int b = 0; // 128 to test for padding/stride.
+ int i;
int src_width_uv = (Abs(src_width) + 1) >> 1;
int src_height_uv = (Abs(src_height) + 1) >> 1;
- int64 src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
- int64 src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
- int src_stride_y = b * 2 + Abs(src_width);
- int src_stride_uv = b * 2 + src_width_uv;
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
- align_buffer_page_end(src_y, src_y_plane_size) align_buffer_page_end(
- src_u, src_uv_plane_size) align_buffer_page_end(src_v, src_uv_plane_size)
- align_buffer_page_end(src_y_16, src_y_plane_size * 2)
- align_buffer_page_end(src_u_16, src_uv_plane_size * 2)
- align_buffer_page_end(src_v_16, src_uv_plane_size * 2)
- uint16* p_src_y_16 = reinterpret_cast<uint16*>(src_y_16);
- uint16* p_src_u_16 = reinterpret_cast<uint16*>(src_u_16);
- uint16* p_src_v_16 = reinterpret_cast<uint16*>(src_v_16);
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16);
+ uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16);
MemRandomize(src_y, src_y_plane_size);
MemRandomize(src_u, src_uv_plane_size);
MemRandomize(src_v, src_uv_plane_size);
- for (i = b; i < src_height + b; ++i) {
- for (j = b; j < src_width + b; ++j) {
- p_src_y_16[(i * src_stride_y) + j] = src_y[(i * src_stride_y) + j];
- }
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i];
}
-
- for (i = b; i < (src_height_uv + b); ++i) {
- for (j = b; j < (src_width_uv + b); ++j) {
- p_src_u_16[(i * src_stride_uv) + j] = src_u[(i * src_stride_uv) + j];
- p_src_v_16[(i * src_stride_uv) + j] = src_v[(i * src_stride_uv) + j];
- }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_16[i] = src_u[i];
+ p_src_v_16[i] = src_v[i];
}
int dst_width_uv = (dst_width + 1) >> 1;
int dst_height_uv = (dst_height + 1) >> 1;
- int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
- int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
- int dst_stride_y = b * 2 + dst_width;
- int dst_stride_uv = b * 2 + dst_width_uv;
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
- align_buffer_page_end(dst_y_8, dst_y_plane_size)
- align_buffer_page_end(dst_u_8, dst_uv_plane_size)
- align_buffer_page_end(dst_v_8, dst_uv_plane_size)
- align_buffer_page_end(dst_y_16, dst_y_plane_size * 2)
- align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2)
- align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2)
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2);
- uint16* p_dst_y_16 =
- reinterpret_cast<uint16*>(dst_y_16);
- uint16* p_dst_u_16 = reinterpret_cast<uint16*>(dst_u_16);
- uint16* p_dst_v_16 = reinterpret_cast<uint16*>(dst_v_16);
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+ uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16);
+ uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16);
- I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
- src_u + (src_stride_uv * b) + b, src_stride_uv,
- src_v + (src_stride_uv * b) + b, src_stride_uv, src_width,
- src_height, dst_y_8 + (dst_stride_y * b) + b, dst_stride_y,
- dst_u_8 + (dst_stride_uv * b) + b, dst_stride_uv,
- dst_v_8 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
- dst_height, f);
-
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
for (i = 0; i < benchmark_iterations; ++i) {
- I420Scale_16(p_src_y_16 + (src_stride_y * b) + b, src_stride_y,
- p_src_u_16 + (src_stride_uv * b) + b, src_stride_uv,
- p_src_v_16 + (src_stride_uv * b) + b, src_stride_uv, src_width,
- src_height, p_dst_y_16 + (dst_stride_y * b) + b, dst_stride_y,
- p_dst_u_16 + (dst_stride_uv * b) + b, dst_stride_uv,
- p_dst_v_16 + (dst_stride_uv * b) + b, dst_stride_uv, dst_width,
- dst_height, f);
+ I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv,
+ p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16,
+ dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16,
+ dst_stride_uv, dst_width, dst_height, f);
}
- // Expect an exact match
+ // Expect an exact match.
int max_diff = 0;
- for (i = b; i < (dst_height + b); ++i) {
- for (j = b; j < (dst_width + b); ++j) {
- int abs_diff = Abs(dst_y_8[(i * dst_stride_y) + j] -
- p_dst_y_16[(i * dst_stride_y) + j]);
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(dst_u_16);
+ free_aligned_buffer_page_end(dst_v_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_16);
+ free_aligned_buffer_page_end(src_u_16);
+ free_aligned_buffer_page_end(src_v_16);
+
+ return max_diff;
+}
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int I444TestFilter(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i, j;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ if (!src_y || !src_u || !src_v) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int64_t dst_y_plane_size = (dst_width) * (dst_height);
+ int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size);
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size);
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size);
+ if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt ||
+ !dst_v_opt) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ double c_time = get_time();
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_c, dst_stride_y, dst_u_c,
+ dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt,
+ dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height,
+ f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT.
+ printf("filter %d - %8d us C - %8d us OPT\n", f,
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // C version may be a little off from the optimized. Order of
+ // operations may introduce rounding somewhere. So do a difference
+ // of the buffers and look to see that the max difference is not
+ // over 3.
+ int max_diff = 0;
+ for (i = 0; i < (dst_height); ++i) {
+ for (j = 0; j < (dst_width); ++j) {
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
- for (i = b; i < (dst_height_uv + b); ++i) {
- for (j = b; j < (dst_width_uv + b); ++j) {
- int abs_diff = Abs(dst_u_8[(i * dst_stride_uv) + j] -
- p_dst_u_16[(i * dst_stride_uv) + j]);
+ for (i = 0; i < (dst_height_uv); ++i) {
+ for (j = 0; j < (dst_width_uv); ++j) {
+ int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
- abs_diff = Abs(dst_v_8[(i * dst_stride_uv) + j] -
- p_dst_v_16[(i * dst_stride_uv) + j]);
+ abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
}
- free_aligned_buffer_page_end(dst_y_8) free_aligned_buffer_page_end(dst_u_8)
- free_aligned_buffer_page_end(dst_v_8)
- free_aligned_buffer_page_end(dst_y_16)
- free_aligned_buffer_page_end(dst_u_16)
- free_aligned_buffer_page_end(dst_v_16)
+ free_aligned_buffer_page_end(dst_y_c);
+ free_aligned_buffer_page_end(dst_u_c);
+ free_aligned_buffer_page_end(dst_v_c);
+ free_aligned_buffer_page_end(dst_y_opt);
+ free_aligned_buffer_page_end(dst_u_opt);
+ free_aligned_buffer_page_end(dst_v_opt);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
- free_aligned_buffer_page_end(src_y)
- free_aligned_buffer_page_end(src_u)
- free_aligned_buffer_page_end(src_v)
- free_aligned_buffer_page_end(src_y_16)
- free_aligned_buffer_page_end(src_u_16)
- free_aligned_buffer_page_end(src_v_16)
+ return max_diff;
+}
- return max_diff;
+// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference.
+// 0 = exact.
+static int I444TestFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int src_width_uv = Abs(src_width);
+ int src_height_uv = Abs(src_height);
+
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv);
+
+ int src_stride_y = Abs(src_width);
+ int src_stride_uv = src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_u, src_uv_plane_size);
+ align_buffer_page_end(src_v, src_uv_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(src_u_16, src_uv_plane_size * 2);
+ align_buffer_page_end(src_v_16, src_uv_plane_size * 2);
+ if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) {
+ printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
+ return 0;
+ }
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16);
+ uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i];
+ }
+ for (i = 0; i < src_uv_plane_size; ++i) {
+ p_src_u_16[i] = src_u[i];
+ p_src_v_16[i] = src_v[i];
+ }
+
+ int dst_width_uv = dst_width;
+ int dst_height_uv = dst_height;
+
+ int dst_y_plane_size = (dst_width) * (dst_height);
+ int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv);
+
+ int dst_stride_y = dst_width;
+ int dst_stride_uv = dst_width_uv;
+
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_u_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_v_8, dst_uv_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2);
+ align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2);
+
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+ uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16);
+ uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16);
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv,
+ src_width, src_height, dst_y_8, dst_stride_y, dst_u_8,
+ dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv,
+ p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16,
+ dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16,
+ dst_stride_uv, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ for (i = 0; i < dst_uv_plane_size; ++i) {
+ int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_u_8);
+ free_aligned_buffer_page_end(dst_v_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(dst_u_16);
+ free_aligned_buffer_page_end(dst_v_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_u);
+ free_aligned_buffer_page_end(src_v);
+ free_aligned_buffer_page_end(src_y_16);
+ free_aligned_buffer_page_end(src_u_16);
+ free_aligned_buffer_page_end(src_v_16);
+
+ return max_diff;
}
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
-// 2 is chroma subsample
+// 2 is chroma subsample.
#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
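+// e.g. with benchmark_width_ = 1280 and a down-by-2 factor (nom = 1,
+// denom = 2): SX(1280, 1, 2) = ((1280 + 1) / 2) * 2 * 2 = 2560 and
+// DX(1280, 1, 2) = ((1280 + 1) / 2) * 1 * 2 = 1280, so the source is exactly
+// twice the destination and both dimensions stay even for 4:2:0 chroma.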
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
- TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \
- int diff = TestFilter( \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \
+ int diff = I420TestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, DISABLED_ScaleDownBy##name##_##filter##_16) { \
- int diff = TestFilter_16( \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \
+ int diff = I444TestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
- kFilter##filter, benchmark_iterations_); \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
@@ -321,47 +551,354 @@
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
- TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
- int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \
- kFilter##filter, benchmark_iterations_, \
- disable_cpu_flags_, benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
+ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
- int diff = TestFilter(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_, disable_cpu_flags_, \
- benchmark_cpu_info_); \
+ TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) { \
+ int diff = I444TestFilter(benchmark_width_, benchmark_height_, width, \
+ height, kFilter##filter, benchmark_iterations_, \
+ disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16( \
+ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
+ int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) { \
+ int diff = I444TestFilter(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- DISABLED_##name##To##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \
- height, kFilter##filter, benchmark_iterations_); \
+ I420##name##From##width##x##height##_##filter##_16) { \
+ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, \
- DISABLED_##name##From##width##x##height##_##filter##_16) { \
- int diff = TestFilter_16(width, height, Abs(benchmark_width_), \
- Abs(benchmark_height_), kFilter##filter, \
- benchmark_iterations_); \
+ I444##name##From##width##x##height##_##filter##_16) { \
+ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
+ Abs(benchmark_height_), kFilter##filter, \
+ benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
- TEST_SCALETO1(name, width, height, Linear, 0) \
- TEST_SCALETO1(name, width, height, Bilinear, 0) \
- TEST_SCALETO1(name, width, height, Box, 0)
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(name, width, height, Box, 3)
TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 320, 240)
-TEST_SCALETO(Scale, 352, 288)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
TEST_SCALETO(Scale, 1280, 720)
+TEST_SCALETO(Scale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
+ SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
+ SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
+ SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ if (!has_ssse3) {
+ printf("Warning SSSE3 not detected; Skipping test.\n");
+ } else {
+ // TL.
+ orig_pixels[0] = 255u;
+ orig_pixels[1] = 0u;
+ orig_pixels[128 + 0] = 0u;
+ orig_pixels[128 + 1] = 0u;
+ // TR.
+ orig_pixels[2] = 0u;
+ orig_pixels[3] = 100u;
+ orig_pixels[128 + 2] = 0u;
+ orig_pixels[128 + 3] = 0u;
+ // BL.
+ orig_pixels[4] = 0u;
+ orig_pixels[5] = 0u;
+ orig_pixels[128 + 4] = 50u;
+ orig_pixels[128 + 5] = 0u;
+ // BR.
+ orig_pixels[6] = 0u;
+ orig_pixels[7] = 0u;
+ orig_pixels[128 + 6] = 0u;
+ orig_pixels[128 + 7] = 20u;
+ // Odd.
+ orig_pixels[126] = 4u;
+ orig_pixels[127] = 255u;
+ orig_pixels[128 + 126] = 16u;
+ orig_pixels[128 + 127] = 255u;
+
+ // Test regular half size.
+ ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
+
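+    // Box average of each 2x2 block with rounding:
+    // dst[0] = (255 + 0 + 0 + 0 + 2) >> 2 = 64,
+    // dst[63] = (4 + 255 + 16 + 255 + 2) >> 2 = 133.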
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(133u, dst_pixels_c[63]);
+
+    // Test odd width version - last pixel is just 1 horizontal pixel.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+
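+    // The last output now averages only the last source column vertically:
+    // (4 + 16) / 2 = 10.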
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(10u, dst_pixels_c[63]);
+
+ // Test one pixel less, should skip the last pixel.
+ memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
+
+ EXPECT_EQ(64u, dst_pixels_c[0]);
+ EXPECT_EQ(25u, dst_pixels_c[1]);
+ EXPECT_EQ(13u, dst_pixels_c[2]);
+ EXPECT_EQ(5u, dst_pixels_c[3]);
+ EXPECT_EQ(0u, dst_pixels_c[4]);
+ EXPECT_EQ(0u, dst_pixels_c[63]);
+
+ // Test regular half size SSSE3.
+ ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+
+ EXPECT_EQ(64u, dst_pixels_opt[0]);
+ EXPECT_EQ(25u, dst_pixels_opt[1]);
+ EXPECT_EQ(13u, dst_pixels_opt[2]);
+ EXPECT_EQ(5u, dst_pixels_opt[3]);
+ EXPECT_EQ(0u, dst_pixels_opt[4]);
+ EXPECT_EQ(133u, dst_pixels_opt[63]);
+
+ // Compare C and SSSE3 match.
+ ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
+ ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
+ for (int i = 0; i < 64; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+ }
+}
+#endif // HAS_SCALEROWDOWN2_SSSE3
+
+extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun.
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
+ memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
+
+ for (int i = 0; i < 640 * 2 + 1; ++i) {
+ orig_pixels[i] = i;
+ }
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ }
+#elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ if (has_mmi) {
+ ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+ }
+#else
+ ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
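+  // 2x bilinear upsample: each output blends the 4 nearest source pixels
+  // with 9:3:3:1 weights, +8 to round, then /16. The last output reads one
+  // source pixel past the row, hence the "+ 1 pixel overrun" above.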
+ EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
+ EXPECT_EQ(dst_pixels_c[1279], 800);
+}
+
+extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+ ptrdiff_t src_stride,
+ uint16_t* dst,
+ int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
+ SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
+ SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+ SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
+
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+ memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+ memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+ for (int i = 0; i < 2560 * 2; ++i) {
+ orig_pixels[i] = i;
+ }
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ if (has_neon) {
+ ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ } else {
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+ }
+#else
+ ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
+#endif
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
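+  // dst[1279] boxes the last 2x2: (2558 + 2559 + 5118 + 5119 + 2) / 4 = 3839.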
+ EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4);
+ EXPECT_EQ(dst_pixels_c[1279], 3839);
+}
+
+// Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel
+// difference.
+// 0 = exact.
+static int TestPlaneFilter_16(int src_width,
+ int src_height,
+ int dst_width,
+ int dst_height,
+ FilterMode f,
+ int benchmark_iterations,
+ int disable_cpu_flags,
+ int benchmark_cpu_info) {
+ if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
+ return 0;
+ }
+
+ int i;
+ int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height));
+ int src_stride_y = Abs(src_width);
+ int dst_y_plane_size = dst_width * dst_height;
+ int dst_stride_y = dst_width;
+
+ align_buffer_page_end(src_y, src_y_plane_size);
+ align_buffer_page_end(src_y_16, src_y_plane_size * 2);
+ align_buffer_page_end(dst_y_8, dst_y_plane_size);
+ align_buffer_page_end(dst_y_16, dst_y_plane_size * 2);
+ uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16);
+ uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16);
+
+ MemRandomize(src_y, src_y_plane_size);
+ memset(dst_y_8, 0, dst_y_plane_size);
+ memset(dst_y_16, 1, dst_y_plane_size * 2);
+
+ for (i = 0; i < src_y_plane_size; ++i) {
+ p_src_y_16[i] = src_y[i] & 255;
+ }
+
+ MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y,
+ dst_width, dst_height, f);
+ MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
+
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16,
+ dst_stride_y, dst_width, dst_height, f);
+ }
+
+ // Expect an exact match.
+ int max_diff = 0;
+ for (i = 0; i < dst_y_plane_size; ++i) {
+ int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_8);
+ free_aligned_buffer_page_end(dst_y_16);
+ free_aligned_buffer_page_end(src_y);
+ free_aligned_buffer_page_end(src_y_16);
+
+ return max_diff;
+}
+
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+// The factor of 2 accounts for chroma subsampling.
+#define DX(x, nom, denom) static_cast<int>(((Abs(x) / nom + 1) / 2) * nom * 2)
+#define SX(x, nom, denom) static_cast<int>(((x / nom + 1) / 2) * denom * 2)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
+ TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \
+ int diff = TestPlaneFilter_16( \
+ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
+ benchmark_cpu_info_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
+// filtered results may differ, since SSSE3, Neon and C use different fixed
+// point implementations. One macro expansion is sketched below.
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, None, nom, denom, 0) \
+ TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \
+ TEST_FACTOR1(name, Box, nom, denom, boxdiff)
+
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 0)
+TEST_FACTOR(8, 1, 8, 0)
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
+TEST_FACTOR(3, 1, 3, 0)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
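
For reference, this is roughly what one instantiation, TEST_FACTOR1(2, None, 1, 2, 0), expands to; a sketch only, with the SX()/DX() dimension rounding left symbolic:

```cpp
// Approximate expansion of TEST_FACTOR1(2, None, 1, 2, 0).
TEST_F(LibYUVScaleTest, ScalePlaneDownBy2_None_16) {
  int diff = TestPlaneFilter_16(
      SX(benchmark_width_, 1, 2), SX(benchmark_height_, 1, 2),
      DX(benchmark_width_, 1, 2), DX(benchmark_height_, 1, 2),
      kFilterNone, benchmark_iterations_, disable_cpu_flags_,
      benchmark_cpu_info_);
  EXPECT_LE(diff, 0);  // unfiltered scaling must match exactly
}
```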
} // namespace libyuv
diff --git a/files/unit_test/testdata/juno.txt b/files/unit_test/testdata/juno.txt
index c275be7..dd46527 100644
--- a/files/unit_test/testdata/juno.txt
+++ b/files/unit_test/testdata/juno.txt
@@ -1,15 +1,15 @@
-Processor : AArch64 Processor rev 0 (aarch64)
-processor : 0
-processor : 1
-processor : 2
-processor : 3
-processor : 4
-processor : 5
-Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
-CPU implementer : 0x41
-CPU architecture: AArch64
-CPU variant : 0x0
-CPU part : 0xd07
-CPU revision : 0
-
-Hardware : Juno
+Processor : AArch64 Processor rev 0 (aarch64)
+processor : 0
+processor : 1
+processor : 2
+processor : 3
+processor : 4
+processor : 5
+Features : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: AArch64
+CPU variant : 0x0
+CPU part : 0xd07
+CPU revision : 0
+
+Hardware : Juno
diff --git a/files/unit_test/testdata/test0.jpg b/files/unit_test/testdata/test0.jpg
new file mode 100644
index 0000000..f4461a8
--- /dev/null
+++ b/files/unit_test/testdata/test0.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test1.jpg b/files/unit_test/testdata/test1.jpg
new file mode 100644
index 0000000..a0210e9
--- /dev/null
+++ b/files/unit_test/testdata/test1.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test2.jpg b/files/unit_test/testdata/test2.jpg
new file mode 100644
index 0000000..816ca76
--- /dev/null
+++ b/files/unit_test/testdata/test2.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test3.jpg b/files/unit_test/testdata/test3.jpg
new file mode 100644
index 0000000..792d91d
--- /dev/null
+++ b/files/unit_test/testdata/test3.jpg
Binary files differ
diff --git a/files/unit_test/testdata/test4.jpg b/files/unit_test/testdata/test4.jpg
new file mode 100644
index 0000000..1ef4166
--- /dev/null
+++ b/files/unit_test/testdata/test4.jpg
Binary files differ
diff --git a/files/unit_test/unit_test.cc b/files/unit_test/unit_test.cc
index 55297e3..a1ae7ea 100644
--- a/files/unit_test/unit_test.cc
+++ b/files/unit_test/unit_test.cc
@@ -17,10 +17,7 @@
#ifdef LIBYUV_USE_GFLAGS
#include "gflags/gflags.h"
#endif
-
-// Change this to 1000 for benchmarking.
-// TODO(fbarchard): Add command line parsing to pass this as option.
-#define BENCHMARK_ITERATIONS 1
+#include "libyuv/cpu_id.h"
unsigned int fastrand_seed = 0xfb;
@@ -34,19 +31,112 @@
"cpu flags for benchmark code. 1 = C, -1 = SIMD");
#else
// Disable command line parameters if gflags is disabled.
-static const int32 FLAGS_libyuv_width = 0;
-static const int32 FLAGS_libyuv_height = 0;
-static const int32 FLAGS_libyuv_repeat = 0;
-static const int32 FLAGS_libyuv_flags = 0;
-static const int32 FLAGS_libyuv_cpu_info = 0;
+static const int32_t FLAGS_libyuv_width = 0;
+static const int32_t FLAGS_libyuv_height = 0;
+static const int32_t FLAGS_libyuv_repeat = 0;
+static const int32_t FLAGS_libyuv_flags = 0;
+static const int32_t FLAGS_libyuv_cpu_info = 0;
#endif
+// Test environment variable for disabling CPU features. Any non-zero value
+// disables the feature. Zero is ignored, to make it easy to toggle the
+// variable on and off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+static LIBYUV_BOOL TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_FALSE;
+}
+#else // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+ return LIBYUV_FALSE;
+}
+#endif
+
+int TestCpuEnv(int cpu_info) {
+#if defined(__arm__) || defined(__aarch64__)
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info &= ~libyuv::kCpuHasNEON;
+ }
+#endif
+#if defined(__mips__) && defined(__linux__)
+ if (TestEnv("LIBYUV_DISABLE_MSA")) {
+ cpu_info &= ~libyuv::kCpuHasMSA;
+ }
+ if (TestEnv("LIBYUV_DISABLE_MMI")) {
+ cpu_info &= ~libyuv::kCpuHasMMI;
+ }
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+ defined(_M_IX86))
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info &= ~libyuv::kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info &= ~libyuv::kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info &= ~libyuv::kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info &= ~libyuv::kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info &= ~libyuv::kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info &= ~libyuv::kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info &= ~libyuv::kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+ cpu_info &= ~libyuv::kCpuHasERMS;
+ }
+ if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+ cpu_info &= ~libyuv::kCpuHasFMA3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_F16C")) {
+ cpu_info &= ~libyuv::kCpuHasF16C;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512BW")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512BW;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512VL")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VL;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512VBMI")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VBMI;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512VBMI2")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VBMI2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512VBITALG")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VBITALG;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX512VPOPCNTDQ")) {
+ cpu_info &= ~libyuv::kCpuHasAVX512VPOPCNTDQ;
+ }
+ if (TestEnv("LIBYUV_DISABLE_GFNI")) {
+ cpu_info &= ~libyuv::kCpuHasGFNI;
+ }
+#endif
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info = libyuv::kCpuInitialized;
+ }
+ return cpu_info;
+}
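
A typical way to wire this hook up is to start from the detected feature set and mask it once at startup, as the test constructors below do. A minimal sketch, assuming libyuv's public cpu_id API (InitCpuFlags and MaskCpuFlags) and a hypothetical helper name; run the binary with e.g. LIBYUV_DISABLE_AVX2=1 to drop a single feature:

```cpp
#include "libyuv/cpu_id.h"

// Sketch: apply the LIBYUV_DISABLE_* environment overrides once, then pin
// the resulting feature mask for all subsequent libyuv calls.
void ApplyCpuEnvOverrides() {
  int cpu_info = libyuv::InitCpuFlags();  // detect CPU features
  cpu_info = TestCpuEnv(cpu_info);        // honor LIBYUV_DISABLE_* variables
  libyuv::MaskCpuFlags(cpu_info);         // force this mask from now on
}
```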
+
// For quicker unittests, the default is 128 x 72. But when benchmarking,
// the default is 720p. The size can also be specified explicitly.
// Set flags to -1 for benchmarking to avoid slower C code.
LibYUVConvertTest::LibYUVConvertTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -90,12 +180,9 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
- static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
@@ -105,7 +192,7 @@
}
LibYUVColorTest::LibYUVColorTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -149,12 +236,9 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
- static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
@@ -164,7 +248,7 @@
}
LibYUVScaleTest::LibYUVScaleTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -208,12 +292,9 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
- static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
@@ -223,7 +304,7 @@
}
LibYUVRotateTest::LibYUVRotateTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -267,12 +348,9 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
- static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
@@ -282,7 +360,7 @@
}
LibYUVPlanarTest::LibYUVPlanarTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -326,12 +404,9 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
- static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
- static_cast<double>(Abs(benchmark_height_)) *
- static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
@@ -341,7 +416,7 @@
}
LibYUVBaseTest::LibYUVBaseTest()
- : benchmark_iterations_(BENCHMARK_ITERATIONS),
+ : benchmark_iterations_(1),
benchmark_width_(128),
benchmark_height_(72),
disable_cpu_flags_(1),
@@ -385,12 +460,65 @@
if (FLAGS_libyuv_cpu_info) {
benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
}
- benchmark_pixels_div256_ =
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
+ benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
static_cast<double>(benchmark_iterations_) +
- 255.0) /
- 256.0);
+ 1279.0) /
+ 1280.0);
+}
+
+LibYUVCompareTest::LibYUVCompareTest()
+ : benchmark_iterations_(1),
+ benchmark_width_(128),
+ benchmark_height_(72),
+ disable_cpu_flags_(1),
+ benchmark_cpu_info_(-1) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ }
+ if (FLAGS_libyuv_repeat) {
+ benchmark_iterations_ = FLAGS_libyuv_repeat;
+ }
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ if (FLAGS_libyuv_width) {
+ benchmark_width_ = FLAGS_libyuv_width;
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
+ if (FLAGS_libyuv_height) {
+ benchmark_height_ = FLAGS_libyuv_height;
+ }
+ const char* cpu_flags = getenv("LIBYUV_FLAGS");
+ if (cpu_flags) {
+ disable_cpu_flags_ = atoi(cpu_flags); // NOLINT
+ }
+ if (FLAGS_libyuv_flags) {
+ disable_cpu_flags_ = FLAGS_libyuv_flags;
+ }
+ const char* cpu_info = getenv("LIBYUV_CPU_INFO");
+ if (cpu_info) {
+ benchmark_cpu_info_ = atoi(cpu_info); // NOLINT
+ }
+ if (FLAGS_libyuv_cpu_info) {
+ benchmark_cpu_info_ = FLAGS_libyuv_cpu_info;
+ }
+ disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_);
+ benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_);
+ libyuv::MaskCpuFlags(benchmark_cpu_info_);
benchmark_pixels_div1280_ =
static_cast<int>((static_cast<double>(Abs(benchmark_width_)) *
static_cast<double>(Abs(benchmark_height_)) *
diff --git a/files/unit_test/unit_test.h b/files/unit_test/unit_test.h
index f7d60a7..87907fa 100644
--- a/files/unit_test/unit_test.h
+++ b/files/unit_test/unit_test.h
@@ -36,6 +36,9 @@
return v >= 0 ? v : -v;
}
+static __inline float FAbs(float v) {
+ return v >= 0 ? v : -v;
+}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step thru the source image, so a
@@ -66,17 +69,15 @@
return true;
}
-#define align_buffer_page_end(var, size) \
- uint8* var; \
- uint8* var##_mem; \
- var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
- var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - \
- (size)) & \
- ~63);
+#define align_buffer_page_end(var, size) \
+ uint8_t* var##_mem = \
+ reinterpret_cast<uint8_t*>(malloc(((size) + 4095 + 63) & ~4095)); \
+ uint8_t* var = reinterpret_cast<uint8_t*>( \
+ (intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - (size)) & ~63)
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \
- var = 0;
+ var = 0
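
The arithmetic here rounds size plus 63 bytes of alignment slack up to a whole number of 4 KB pages, then places the usable buffer so it is 64-byte aligned and its last byte lands near the end of the allocation, which helps surface overreads past the buffer. A standalone sketch of the same math, with a hypothetical size of 100:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  const size_t size = 100;  // hypothetical buffer size
  // Round size + 63 up to a multiple of 4096, as the macro does: -> 4096.
  const size_t alloc = (size + 4095 + 63) & ~size_t{4095};
  uint8_t* mem = static_cast<uint8_t*>(malloc(alloc));
  // Point at the last `size` bytes, then align the pointer down to 64 bytes.
  uint8_t* var = reinterpret_cast<uint8_t*>(
      reinterpret_cast<intptr_t>(mem + alloc - size) & ~intptr_t{63});
  printf("%zu-byte block, buffer starts %td bytes in\n", alloc, var - mem);
  free(mem);
  return 0;
}
```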
#ifdef WIN32
static inline double get_time() {
@@ -110,10 +111,10 @@
return static_cast<int>((fastrand_seed >> 16) & 0xffff);
}
-static inline void MemRandomize(uint8* dst, int64 len) {
- int64 i;
+static inline void MemRandomize(uint8_t* dst, int64_t len) {
+ int64_t i;
for (i = 0; i < len - 1; i += 2) {
- *reinterpret_cast<uint16*>(dst) = fastrand();
+ *reinterpret_cast<uint16_t*>(dst) = fastrand();
dst += 2;
}
for (; i < len; ++i) {
@@ -125,10 +126,9 @@
protected:
LibYUVColorTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
@@ -138,10 +138,9 @@
protected:
LibYUVConvertTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
@@ -151,10 +150,9 @@
protected:
LibYUVScaleTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
@@ -164,10 +162,9 @@
protected:
LibYUVRotateTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
@@ -177,10 +174,9 @@
protected:
LibYUVPlanarTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
@@ -190,10 +186,21 @@
protected:
LibYUVBaseTest();
- int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
- int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
- int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
- int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
+ int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
+ int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
+ int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
+};
+
+class LibYUVCompareTest : public ::testing::Test {
+ protected:
+ LibYUVCompareTest();
+
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
+ int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
int disable_cpu_flags_; // Default 1. Use -1 for benchmarking.
int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD.
diff --git a/files/unit_test/video_common_test.cc b/files/unit_test/video_common_test.cc
index f16b677..a84206a 100644
--- a/files/unit_test/video_common_test.cc
+++ b/files/unit_test/video_common_test.cc
@@ -18,15 +18,12 @@
// Tests FourCC codes in video common, which are used for ConvertToI420().
-static bool TestValidChar(uint32 onecc) {
- if ((onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') ||
- (onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff)) {
- return true;
- }
- return false;
+static bool TestValidChar(uint32_t onecc) {
+ return (onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') ||
+ (onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff);
}
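
TestValidChar accepts exactly the byte values FourCC codes are composed of; the byte extraction in TestValidFourCC that follows assumes four such bytes packed little-endian into a uint32_t. A sketch of that packing (MakeFourCC is an illustrative stand-in; libyuv's own FOURCC() macro in video_common.h has the same effect):

```cpp
#include <cstdint>

// Illustrative helper: pack four characters little-endian, the layout the
// per-byte shifts in TestValidFourCC below take apart.
constexpr uint32_t MakeFourCC(char a, char b, char c, char d) {
  return static_cast<uint32_t>(static_cast<uint8_t>(a)) |
         (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24);
}
static_assert(MakeFourCC('I', '4', '2', '0') == 0x30323449, "I420 packing");
```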
-static bool TestValidFourCC(uint32 fourcc, int bpp) {
+static bool TestValidFourCC(uint32_t fourcc, int bpp) {
if (!TestValidChar(fourcc & 0xff) || !TestValidChar((fourcc >> 8) & 0xff) ||
!TestValidChar((fourcc >> 16) & 0xff) ||
!TestValidChar((fourcc >> 24) & 0xff)) {
@@ -39,23 +36,23 @@
}
TEST_F(LibYUVBaseTest, TestCanonicalFourCC) {
- EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
- EXPECT_EQ(static_cast<uint32>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
- EXPECT_EQ(static_cast<uint32>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
- EXPECT_EQ(static_cast<uint32>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
- EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
- EXPECT_EQ(static_cast<uint32>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
- EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
- EXPECT_EQ(static_cast<uint32>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
- EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
- EXPECT_EQ(static_cast<uint32>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
- EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
- EXPECT_EQ(static_cast<uint32>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
- EXPECT_EQ(static_cast<uint32>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
- EXPECT_EQ(static_cast<uint32>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
- EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
- EXPECT_EQ(static_cast<uint32>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
- EXPECT_EQ(static_cast<uint32>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_I420), CanonicalFourCC(FOURCC_YU12));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_I422), CanonicalFourCC(FOURCC_YU16));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_I444), CanonicalFourCC(FOURCC_YU24));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565));
+ EXPECT_EQ(static_cast<uint32_t>(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551));
}
TEST_F(LibYUVBaseTest, TestFourCC) {
@@ -73,12 +70,17 @@
EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30));
EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010));
EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
diff --git a/files/util/Makefile b/files/util/Makefile
index 6044d2a..40e74b6 100644
--- a/files/util/Makefile
+++ b/files/util/Makefile
@@ -4,3 +4,6 @@
else
$(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
endif
+
+# for macOS
+# /usr/local/bin/g++-7 -msse2 -O3 -fopenmp -Bstatic -o psnr psnr.cc ssim.cc psnr_main.cc
diff --git a/files/util/compare.cc b/files/util/compare.cc
index ef0beef..a16613e 100644
--- a/files/util/compare.cc
+++ b/files/util/compare.cc
@@ -29,22 +29,24 @@
FILE* fin2 = name2 ? fopen(name2, "rb") : NULL;
const int kBlockSize = 32768;
- uint8 buf1[kBlockSize];
- uint8 buf2[kBlockSize];
- uint32 hash1 = 5381;
- uint32 hash2 = 5381;
- uint64 sum_square_err = 0;
- uint64 size_min = 0;
+ uint8_t buf1[kBlockSize];
+ uint8_t buf2[kBlockSize];
+ uint32_t hash1 = 5381;
+ uint32_t hash2 = 5381;
+ uint64_t sum_square_err = 0;
+ uint64_t size_min = 0;
int amt1 = 0;
int amt2 = 0;
do {
amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
- if (amt1 > 0)
+ if (amt1 > 0) {
hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+ }
if (fin2) {
amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
- if (amt2 > 0)
+ if (amt2 > 0) {
hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+ }
int amt_min = (amt1 < amt2) ? amt1 : amt2;
size_min += amt_min;
sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
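
Both files are hashed with libyuv::HashDjb2 while the squared error accumulates over the common prefix. HashDjb2 is the classic djb2 hash, which is why both running hashes start from the seed 5381 above; a scalar reference sketch (libyuv's implementation computes the same recurrence with SIMD):

```cpp
#include <cstddef>
#include <cstdint>

// Scalar djb2 reference: h = h * 33 + byte, conventionally seeded with 5381.
static uint32_t HashDjb2_C(const uint8_t* data, size_t len, uint32_t seed) {
  uint32_t hash = seed;
  for (size_t i = 0; i < len; ++i) {
    hash = hash * 33 + data[i];
  }
  return hash;
}
```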
diff --git a/files/util/cpuid.c b/files/util/cpuid.c
index 9716f11..84c0602 100644
--- a/files/util/cpuid.c
+++ b/files/util/cpuid.c
@@ -22,10 +22,13 @@
int has_arm = TestCpuFlag(kCpuHasARM);
int has_mips = TestCpuFlag(kCpuHasMIPS);
int has_x86 = TestCpuFlag(kCpuHasX86);
+ (void)argc;
+ (void)argv;
+
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
if (has_x86) {
- uint32 family, model, cpu_info[4];
+ int family, model, cpu_info[4];
// Vendor ID:
// AuthenticAMD AMD processor
// CentaurHauls Centaur processor
@@ -66,8 +69,10 @@
printf("Has NEON %x\n", has_neon);
}
if (has_mips) {
- int has_dspr2 = TestCpuFlag(kCpuHasDSPR2);
- printf("Has DSPR2 %x\n", has_dspr2);
+ int has_msa = TestCpuFlag(kCpuHasMSA);
+ printf("Has MSA %x\n", has_msa);
+ int has_mmi = TestCpuFlag(kCpuHasMMI);
+ printf("Has MMI %x\n", has_mmi);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -76,20 +81,32 @@
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
- int has_avx3 = TestCpuFlag(kCpuHasAVX3);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
- int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_f16c = TestCpuFlag(kCpuHasF16C);
+ int has_gfni = TestCpuFlag(kCpuHasGFNI);
+ int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+ int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+ int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+ int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+ int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+ int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ);
printf("Has SSE2 %x\n", has_sse2);
printf("Has SSSE3 %x\n", has_ssse3);
printf("Has SSE4.1 %x\n", has_sse41);
printf("Has SSE4.2 %x\n", has_sse42);
printf("Has AVX %x\n", has_avx);
printf("Has AVX2 %x\n", has_avx2);
- printf("Has AVX3 %x\n", has_avx3);
printf("Has ERMS %x\n", has_erms);
printf("Has FMA3 %x\n", has_fma3);
printf("Has F16C %x\n", has_f16c);
+ printf("Has GFNI %x\n", has_gfni);
+ printf("Has AVX512BW %x\n", has_avx512bw);
+ printf("Has AVX512VL %x\n", has_avx512vl);
+ printf("Has AVX512VBMI %x\n", has_avx512vbmi);
+ printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2);
+ printf("Has AVX512VBITALG %x\n", has_avx512vbitalg);
+ printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq);
}
return 0;
}
diff --git a/files/util/psnr.cc b/files/util/psnr.cc
index 27f876c..c7bee7f 100644
--- a/files/util/psnr.cc
+++ b/files/util/psnr.cc
@@ -21,14 +21,14 @@
extern "C" {
#endif
-typedef unsigned int uint32; // NOLINT
+typedef unsigned int uint32_t; // NOLINT
#ifdef _MSC_VER
-typedef unsigned __int64 uint64;
+typedef unsigned __int64 uint64_t;
#else // COMPILER_MSVC
#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64; // NOLINT
+typedef unsigned long uint64_t; // NOLINT
#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64; // NOLINT
+typedef unsigned long long uint64_t; // NOLINT
#endif // __LP64__
#endif // _MSC_VER
@@ -38,10 +38,10 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
-static uint32 SumSquareError_NEON(const uint8* src_a,
- const uint8* src_b,
- int count) {
- volatile uint32 sse;
+static uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ volatile uint32_t sse;
asm volatile(
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
@@ -73,10 +73,10 @@
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SUMSQUAREERROR_NEON
-static uint32 SumSquareError_NEON(const uint8* src_a,
- const uint8* src_b,
- int count) {
- volatile uint32 sse;
+static uint32_t SumSquareError_NEON(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ volatile uint32_t sse;
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
@@ -107,9 +107,9 @@
}
#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked) static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
- const uint8* /*src_b*/,
- int /*count*/) {
+__declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/,
+ const uint8_t* /*src_b*/,
+ int /*count*/) {
__asm {
mov eax, [esp + 4] // src_a
mov edx, [esp + 8] // src_b
@@ -146,10 +146,10 @@
}
#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
-static uint32 SumSquareError_SSE2(const uint8* src_a,
- const uint8* src_b,
- int count) {
- uint32 sse;
+static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse;
asm volatile( // NOLINT
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
@@ -189,7 +189,7 @@
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
- ); // NOLINT
+ ); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc
@@ -228,22 +228,22 @@
}
#endif // HAS_SUMSQUAREERROR_SSE2
-static uint32 SumSquareError_C(const uint8* src_a,
- const uint8* src_b,
- int count) {
- uint32 sse = 0u;
+static uint32_t SumSquareError_C(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count) {
+ uint32_t sse = 0u;
for (int x = 0; x < count; ++x) {
int diff = src_a[x] - src_b[x];
- sse += static_cast<uint32>(diff * diff);
+ sse += static_cast<uint32_t>(diff * diff);
}
return sse;
}
-double ComputeSumSquareError(const uint8* src_a,
- const uint8* src_b,
+double ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
int count) {
- uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
- SumSquareError_C;
+ uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+ int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
SumSquareError = SumSquareError_NEON;
#endif
@@ -253,7 +253,7 @@
}
#endif
const int kBlockSize = 1 << 15;
- uint64 sse = 0;
+ uint64_t sse = 0;
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
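
The 32 KB block size is what keeps each per-block 32-bit sum safe: the worst case per block is

```latex
2^{15} \cdot 255^2 = 32768 \cdot 65025 = 2\,130\,739\,200 < 2^{32} - 1,
```

so an individual SumSquareError call cannot overflow, and only the 64-bit sse accumulates across blocks.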
@@ -280,8 +280,9 @@
// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
double ComputePSNR(double sse, double size) {
const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0);
- if (sse <= kMINSSE)
+ if (sse <= kMINSSE) {
sse = kMINSSE; // Produces max PSNR of 128
+ }
return 10.0 * log10(255.0 * 255.0 * size / sse);
}
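
In LaTeX form, ComputePSNR evaluates, for N = size samples and with the SSE floor that caps the result at kMaxPSNR = 128 dB:

```latex
\mathrm{PSNR} = 10 \log_{10}\!\left( \frac{255^2 \cdot N}{\max(\mathrm{SSE},\ \mathrm{SSE}_{\min})} \right),
\qquad
\mathrm{SSE}_{\min} = \frac{255^2 \cdot N}{10^{128/10}}
```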
diff --git a/files/util/psnr.h b/files/util/psnr.h
index 0816b97..aac128c 100644
--- a/files/util/psnr.h
+++ b/files/util/psnr.h
@@ -20,7 +20,7 @@
#endif
#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
-typedef unsigned char uint8;
+typedef unsigned char uint8_t;
#define UINT8_TYPE_DEFINED
#endif
@@ -31,7 +31,9 @@
#if !defined(HAVE_JPEG)
// Compute Sum of Squared Error (SSE).
// Pass this to ComputePSNR for final result.
-double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+double ComputeSumSquareError(const uint8_t* src_a,
+ const uint8_t* src_b,
+ int count);
#endif
// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
diff --git a/files/util/psnr_main.cc b/files/util/psnr_main.cc
index 4d930be..a930b20 100644
--- a/files/util/psnr_main.cc
+++ b/files/util/psnr_main.cc
@@ -90,9 +90,9 @@
fseek(file_org, 0, SEEK_END);
size_t total_size = ftell(file_org);
fseek(file_org, 0, SEEK_SET);
- uint8* const ch_org = new uint8[total_size];
+ uint8_t* const ch_org = new uint8_t[total_size];
memset(ch_org, 0, total_size);
- size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+ size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org);
fclose(file_org);
if (bytes_org == total_size) {
if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) {
@@ -107,13 +107,15 @@
// Scale Y channel from 16..240 to 0..255.
// This can be useful when comparing codecs that are inconsistent about Y.
-uint8 ScaleY(uint8 y) {
+uint8_t ScaleY(uint8_t y) {
int ny = (y - 16) * 256 / 224;
- if (ny < 0)
+ if (ny < 0) {
ny = 0;
- if (ny > 255)
+ }
+ if (ny > 255) {
ny = 255;
- return static_cast<uint8>(ny);
+ }
+ return static_cast<uint8_t>(ny);
}
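
The endpoints confirm the mapping:

```latex
\mathrm{ScaleY}(16) = \frac{(16 - 16) \cdot 256}{224} = 0,
\qquad
\mathrm{ScaleY}(240) = \frac{(240 - 16) \cdot 256}{224} = 256 \rightarrow 255 \text{ (clamped)}
```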
// MSE = Mean Square Error
@@ -150,8 +152,9 @@
}
void ParseOptions(int argc, const char* argv[]) {
- if (argc <= 1)
+ if (argc <= 1) {
PrintHelp(argv[0]);
+ }
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@@ -237,8 +240,8 @@
}
}
-bool UpdateMetrics(uint8* ch_org,
- uint8* ch_rec,
+bool UpdateMetrics(uint8_t* ch_org,
+ uint8_t* ch_rec,
const int y_size,
const int uv_size,
const size_t total_size,
@@ -247,10 +250,10 @@
metric* distorted_frame,
bool do_psnr) {
const int uv_offset = (do_swap_uv ? uv_size : 0);
- const uint8* const u_org = ch_org + y_size + uv_offset;
- const uint8* const u_rec = ch_rec + y_size;
- const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
- const uint8* const v_rec = ch_rec + y_size + uv_size;
+ const uint8_t* const u_org = ch_org + y_size + uv_offset;
+ const uint8_t* const u_rec = ch_rec + y_size;
+ const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset);
+ const uint8_t* const v_rec = ch_rec + y_size + uv_size;
if (do_psnr) {
#ifdef HAVE_JPEG
double y_err = static_cast<double>(
@@ -301,12 +304,15 @@
cur_distortion_psnr->all += distorted_frame->all;
bool ismin = false;
- if (distorted_frame->y < cur_distortion_psnr->min_y)
+ if (distorted_frame->y < cur_distortion_psnr->min_y) {
cur_distortion_psnr->min_y = distorted_frame->y;
- if (distorted_frame->u < cur_distortion_psnr->min_u)
+ }
+ if (distorted_frame->u < cur_distortion_psnr->min_u) {
cur_distortion_psnr->min_u = distorted_frame->u;
- if (distorted_frame->v < cur_distortion_psnr->min_v)
+ }
+ if (distorted_frame->v < cur_distortion_psnr->min_v) {
cur_distortion_psnr->min_v = distorted_frame->v;
+ }
if (distorted_frame->all < cur_distortion_psnr->min_all) {
cur_distortion_psnr->min_all = distorted_frame->all;
cur_distortion_psnr->min_frame = number_of_frames;
@@ -374,8 +380,8 @@
#endif
}
- uint8* const ch_org = new uint8[total_size];
- uint8* const ch_rec = new uint8[total_size];
+ uint8_t* const ch_org = new uint8_t[total_size];
+ uint8_t* const ch_rec = new uint8_t[total_size];
if (ch_org == NULL || ch_rec == NULL) {
fprintf(stderr, "No memory available\n");
fclose(file_org);
@@ -429,14 +435,15 @@
int number_of_frames;
for (number_of_frames = 0;; ++number_of_frames) {
- if (num_frames && number_of_frames >= num_frames)
+ if (num_frames && number_of_frames >= num_frames) {
break;
+ }
- size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+ size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org);
if (bytes_org < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
- uint8* const ch_jpeg = new uint8[bytes_org];
+ uint8_t* const ch_jpeg = new uint8_t[bytes_org];
memcpy(ch_jpeg, ch_org, bytes_org);
memset(ch_org, 0, total_size);
@@ -456,11 +463,11 @@
for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
size_t bytes_rec =
- fread(ch_rec, sizeof(uint8), total_size, file_rec[cur_rec]);
+ fread(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]);
if (bytes_rec < total_size) {
#ifdef HAVE_JPEG
// Try parsing file as a jpeg.
- uint8* const ch_jpeg = new uint8[bytes_rec];
+ uint8_t* const ch_jpeg = new uint8_t[bytes_rec];
memcpy(ch_jpeg, ch_rec, bytes_rec);
memset(ch_rec, 0, total_size);
@@ -482,7 +489,7 @@
printf("%5d", number_of_frames);
}
if (do_psnr) {
- metric distorted_frame;
+ metric distorted_frame = {};
metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
number_of_frames, cur_distortion_psnr,
@@ -496,7 +503,7 @@
}
}
if (do_ssim) {
- metric distorted_frame;
+ metric distorted_frame = {};
metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size,
number_of_frames, cur_distortion_ssim,
diff --git a/files/util/ssim.cc b/files/util/ssim.cc
index 43e725d..096fbcf 100644
--- a/files/util/ssim.cc
+++ b/files/util/ssim.cc
@@ -16,8 +16,8 @@
extern "C" {
#endif
-typedef unsigned int uint32; // NOLINT
-typedef unsigned short uint16; // NOLINT
+typedef unsigned int uint32_t; // NOLINT
+typedef unsigned short uint16_t; // NOLINT
#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
(defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
@@ -50,7 +50,7 @@
#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
-#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
+#define PWEIGHT(A, B) static_cast<uint16_t>(K[(A)] * K[(B)]) // weight product
#define MAKE_WEIGHT(L) \
{ \
{ \
@@ -66,7 +66,7 @@
// values. We can't call _mm_set_epi16() for static compile-time initialization.
static const struct {
union {
- uint16 i16_[8];
+ uint16_t i16_[8];
__m128i m_;
} values_;
} W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2),
@@ -88,10 +88,12 @@
double sxx = xxm * iw - iwx * iwx;
double syy = yym * iw - iwy * iwy;
// Small errors are possible due to rounding. Clamp to zero.
- if (sxx < 0.)
+ if (sxx < 0.) {
sxx = 0.;
- if (syy < 0.)
+ }
+ if (syy < 0.) {
syy = 0.;
+ }
const double sxsy = sqrt(sxx * syy);
const double sxy = xym * iw - iwx * iwy;
static const double C11 = (0.01 * 0.01) * (255 * 255);
@@ -109,21 +111,22 @@
// Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1)
// per dimension, with a diff of 255, squared. The maximum accumulated value
// is thus 33 * 33 * 255 * 255 = 0x4388241, which fits in a 32 bit integer.
-double GetSSIM(const uint8* org,
- const uint8* rec,
+double GetSSIM(const uint8_t* org,
+ const uint8_t* rec,
int xo,
int yo,
int W,
int H,
int stride) {
- uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+ uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
org += (yo - KERNEL) * stride;
org += (xo - KERNEL);
rec += (yo - KERNEL) * stride;
rec += (xo - KERNEL);
for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
- if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H))
+ if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) {
continue;
+ }
const int Wy = K[y_];
for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
const int Wxy = Wy * K[x_];
@@ -142,13 +145,13 @@
return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
}
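
For reference, FinalizeSSIM evaluates the standard SSIM index from these weighted moments. In the usual notation, with L = 255 and K1 = 0.01 visible as C11 above (the customary K2 = 0.03 companion constant is assumed to sit in the elided part of FinalizeSSIM):

```latex
\mathrm{SSIM}(x, y) =
  \frac{(2\mu_x\mu_y + C_1)\,(2\sigma_{xy} + C_2)}
       {(\mu_x^2 + \mu_y^2 + C_1)\,(\sigma_x^2 + \sigma_y^2 + C_2)},
\qquad C_1 = (K_1 L)^2,\ C_2 = (K_2 L)^2
```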
-double GetSSIMFullKernel(const uint8* org,
- const uint8* rec,
+double GetSSIMFullKernel(const uint8_t* org,
+ const uint8_t* rec,
int xo,
int yo,
int stride,
double area_weight) {
- uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+ uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
@@ -262,7 +265,7 @@
#define ADD_AND_STORE_FOUR_EPI32(M, OUT) \
do { \
- uint32 tmp[4]; \
+ uint32_t tmp[4]; \
_mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
(OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
} while (0)
@@ -292,8 +295,8 @@
return (x > y) ? x : y;
}
-double CalcSSIM(const uint8* org,
- const uint8* rec,
+double CalcSSIM(const uint8_t* org,
+ const uint8_t* rec,
const int image_width,
const int image_height) {
double SSIM = 0.;
@@ -328,8 +331,8 @@
// NOTE: we could use a similar method for the left-most pixels too.
const int kScratchWidth = 8;
const int kScratchStride = kScratchWidth + KERNEL + 1;
- uint8 scratch_org[KERNEL_SIZE * kScratchStride] = {0};
- uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
+ uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0};
+ uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0};
for (int k = 0; k < KERNEL_SIZE; ++k) {
const int offset =
diff --git a/files/util/ssim.h b/files/util/ssim.h
index 4647f45..a855f1d 100644
--- a/files/util/ssim.h
+++ b/files/util/ssim.h
@@ -20,12 +20,12 @@
#endif
#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
-typedef unsigned char uint8;
+typedef unsigned char uint8_t;
#define UINT8_TYPE_DEFINED
#endif
-double CalcSSIM(const uint8* org,
- const uint8* rec,
+double CalcSSIM(const uint8_t* org,
+ const uint8_t* rec,
const int image_width,
const int image_height);
diff --git a/files/util/yuvconvert.cc b/files/util/yuvconvert.cc
index bc01d9f..27cdfe9 100644
--- a/files/util/yuvconvert.cc
+++ b/files/util/yuvconvert.cc
@@ -37,7 +37,7 @@
int num_frames = 0; // Number of frames to convert.
int filter = 1; // Bilinear filter for scaling.
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
return v >= 0 ? v : -v;
}
@@ -79,8 +79,9 @@
}
void ParseOptions(int argc, const char* argv[]) {
- if (argc <= 1)
+ if (argc <= 1) {
PrintHelp(argv[0]);
+ }
for (int c = 1; c < argc; ++c) {
if (!strcmp(argv[c], "-v")) {
verbose = true;
@@ -158,11 +159,11 @@
static const int kTileX = 32;
static const int kTileY = 32;
-static int TileARGBScale(const uint8* src_argb,
+static int TileARGBScale(const uint8_t* src_argb,
int src_stride_argb,
int src_width,
int src_height,
- uint8* dst_argb,
+ uint8_t* dst_argb,
int dst_stride_argb,
int dst_width,
int dst_height,
@@ -242,9 +243,9 @@
fseek(file_org, num_skip_org * total_size, SEEK_SET);
#endif
- uint8* const ch_org = new uint8[org_size];
- uint8* const ch_dst = new uint8[dst_size];
- uint8* const ch_rec = new uint8[total_size];
+ uint8_t* const ch_org = new uint8_t[org_size];
+ uint8_t* const ch_dst = new uint8_t[dst_size];
+ uint8_t* const ch_rec = new uint8_t[total_size];
if (ch_org == NULL || ch_rec == NULL) {
fprintf(stderr, "No memory available\n");
fclose(file_org);
@@ -265,14 +266,16 @@
int number_of_frames;
for (number_of_frames = 0;; ++number_of_frames) {
- if (num_frames && number_of_frames >= num_frames)
+ if (num_frames && number_of_frames >= num_frames) {
break;
+ }
// Load original YUV or ARGB frame.
size_t bytes_org =
- fread(ch_org, sizeof(uint8), static_cast<size_t>(org_size), file_org);
- if (bytes_org < static_cast<size_t>(org_size))
+ fread(ch_org, sizeof(uint8_t), static_cast<size_t>(org_size), file_org);
+ if (bytes_org < static_cast<size_t>(org_size)) {
break;
+ }
// TODO(fbarchard): Attenuate doesn't need to know dimensions.
// ARGB attenuate frame
@@ -329,16 +332,18 @@
// Output YUV or ARGB frame.
if (rec_is_yuv) {
size_t bytes_rec =
- fwrite(ch_rec, sizeof(uint8), static_cast<size_t>(total_size),
+ fwrite(ch_rec, sizeof(uint8_t), static_cast<size_t>(total_size),
file_rec[cur_rec]);
- if (bytes_rec < static_cast<size_t>(total_size))
+ if (bytes_rec < static_cast<size_t>(total_size)) {
break;
+ }
} else {
size_t bytes_rec =
- fwrite(ch_dst, sizeof(uint8), static_cast<size_t>(dst_size),
+ fwrite(ch_dst, sizeof(uint8_t), static_cast<size_t>(dst_size),
file_rec[cur_rec]);
- if (bytes_rec < static_cast<size_t>(dst_size))
+ if (bytes_rec < static_cast<size_t>(dst_size)) {
break;
+ }
}
if (verbose) {
printf("%5d", number_of_frames);