Snap for 8253222 from 1f65662f464ec9b476981a5067dc949612e9adde to sdk-release
Change-Id: I380b08c675ac9177cb0b2ec5b554a75be6d90021
diff --git a/Android.bp b/Android.bp
index 416bfd8..ea7a7bd 100644
--- a/Android.bp
+++ b/Android.bp
@@ -60,17 +60,11 @@
"-Werror",
],
- conlyflags: ["-std=c99"],
+ c_std: "gnu11",
// Build BoringSSL and its tests against the same STL.
sdk_version: "9",
target: {
- linux: {
- cflags: ["-D_XOPEN_SOURCE=700"],
- },
- windows: {
- cflags: ["-D_XOPEN_SOURCE=700"],
- },
android: {
stl: "libc++_static",
},
@@ -421,53 +415,6 @@
},
}
-// Used for CAVP testing for FIPS certification.
-// Not installed on devices by default.
-cc_binary {
- name: "cavp",
- host_supported: true,
- srcs: [
- "src/util/fipstools/cavp/cavp_aes_gcm_test.cc",
- "src/util/fipstools/cavp/cavp_aes_test.cc",
- "src/util/fipstools/cavp/cavp_ctr_drbg_test.cc",
- "src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc",
- "src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc",
- "src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc",
- "src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc",
- "src/util/fipstools/cavp/cavp_hmac_test.cc",
- "src/util/fipstools/cavp/cavp_kas_test.cc",
- "src/util/fipstools/cavp/cavp_keywrap_test.cc",
- "src/util/fipstools/cavp/cavp_main.cc",
- "src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc",
- "src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc",
- "src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc",
- "src/util/fipstools/cavp/cavp_sha_monte_test.cc",
- "src/util/fipstools/cavp/cavp_sha_test.cc",
- "src/util/fipstools/cavp/cavp_tdes_test.cc",
- "src/util/fipstools/cavp/cavp_test_util.cc",
- "src/util/fipstools/cavp/cavp_tlskdf_test.cc",
- ],
- target: {
- android: {
- compile_multilib: "both",
- },
- },
- multilib: {
- lib32: {
- suffix: "32",
- },
- },
-
- shared_libs: [
- "libcrypto",
- ],
-
- defaults: [
- "boringssl_test_support_sources",
- "boringssl_flags",
- ],
-}
-
// Used for ACVP testing for FIPS certification.
// Not installed on devices by default.
cc_binary {
@@ -584,6 +531,6 @@
"libcrypto",
],
srcs: [
- "src/util/fipstools/cavp/test_fips.c",
+ "src/util/fipstools/test_fips.c",
],
}
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index 9fc401a..95a1efc 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-345c86b1cfcc478a71a9a71f0206893fd16ae912
+81502beeddc5f116d44d0898c6c4a33057198db8
diff --git a/BUILD.generated.bzl b/BUILD.generated.bzl
index 8621d9b..bf9efa7 100644
--- a/BUILD.generated.bzl
+++ b/BUILD.generated.bzl
@@ -37,8 +37,6 @@
"src/crypto/fipsmodule/cipher/aead.c",
"src/crypto/fipsmodule/cipher/cipher.c",
"src/crypto/fipsmodule/cipher/e_aes.c",
- "src/crypto/fipsmodule/cipher/e_des.c",
- "src/crypto/fipsmodule/des/des.c",
"src/crypto/fipsmodule/dh/check.c",
"src/crypto/fipsmodule/dh/dh.c",
"src/crypto/fipsmodule/digest/digest.c",
@@ -218,6 +216,7 @@
"src/crypto/cpu_arm_linux.h",
"src/crypto/curve25519/curve25519_tables.h",
"src/crypto/curve25519/internal.h",
+ "src/crypto/des/internal.h",
"src/crypto/dsa/internal.h",
"src/crypto/ec_extra/internal.h",
"src/crypto/err/internal.h",
@@ -227,7 +226,7 @@
"src/crypto/fipsmodule/bn/rsaz_exp.h",
"src/crypto/fipsmodule/cipher/internal.h",
"src/crypto/fipsmodule/delocate.h",
- "src/crypto/fipsmodule/des/internal.h",
+ "src/crypto/fipsmodule/dh/internal.h",
"src/crypto/fipsmodule/digest/internal.h",
"src/crypto/fipsmodule/digest/md32_common.h",
"src/crypto/fipsmodule/ec/internal.h",
@@ -320,6 +319,7 @@
"src/crypto/cipher_extra/e_aesctrhmac.c",
"src/crypto/cipher_extra/e_aesgcmsiv.c",
"src/crypto/cipher_extra/e_chacha20poly1305.c",
+ "src/crypto/cipher_extra/e_des.c",
"src/crypto/cipher_extra/e_null.c",
"src/crypto/cipher_extra/e_rc2.c",
"src/crypto/cipher_extra/e_rc4.c",
@@ -338,6 +338,7 @@
"src/crypto/crypto.c",
"src/crypto/curve25519/curve25519.c",
"src/crypto/curve25519/spake25519.c",
+ "src/crypto/des/des.c",
"src/crypto/dh_extra/dh_asn1.c",
"src/crypto/dh_extra/params.c",
"src/crypto/digest_extra/digest_extra.c",
@@ -520,31 +521,69 @@
"src/tool/transport_common.h",
]
-crypto_sources_ios_aarch64 = [
- "ios-aarch64/crypto/chacha/chacha-armv8.S",
- "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S",
- "ios-aarch64/crypto/fipsmodule/armv8-mont.S",
- "ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
- "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
- "ios-aarch64/crypto/fipsmodule/sha1-armv8.S",
- "ios-aarch64/crypto/fipsmodule/sha256-armv8.S",
- "ios-aarch64/crypto/fipsmodule/sha512-armv8.S",
- "ios-aarch64/crypto/fipsmodule/vpaes-armv8.S",
- "ios-aarch64/crypto/test/trampoline-armv8.S",
+crypto_sources_apple_aarch64 = [
+ "apple-aarch64/crypto/chacha/chacha-armv8.S",
+ "apple-aarch64/crypto/fipsmodule/aesv8-armx64.S",
+ "apple-aarch64/crypto/fipsmodule/armv8-mont.S",
+ "apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
+ "apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
+ "apple-aarch64/crypto/fipsmodule/sha1-armv8.S",
+ "apple-aarch64/crypto/fipsmodule/sha256-armv8.S",
+ "apple-aarch64/crypto/fipsmodule/sha512-armv8.S",
+ "apple-aarch64/crypto/fipsmodule/vpaes-armv8.S",
+ "apple-aarch64/crypto/test/trampoline-armv8.S",
]
-crypto_sources_ios_arm = [
- "ios-arm/crypto/chacha/chacha-armv4.S",
- "ios-arm/crypto/fipsmodule/aesv8-armx32.S",
- "ios-arm/crypto/fipsmodule/armv4-mont.S",
- "ios-arm/crypto/fipsmodule/bsaes-armv7.S",
- "ios-arm/crypto/fipsmodule/ghash-armv4.S",
- "ios-arm/crypto/fipsmodule/ghashv8-armx32.S",
- "ios-arm/crypto/fipsmodule/sha1-armv4-large.S",
- "ios-arm/crypto/fipsmodule/sha256-armv4.S",
- "ios-arm/crypto/fipsmodule/sha512-armv4.S",
- "ios-arm/crypto/fipsmodule/vpaes-armv7.S",
- "ios-arm/crypto/test/trampoline-armv4.S",
+crypto_sources_apple_arm = [
+ "apple-arm/crypto/chacha/chacha-armv4.S",
+ "apple-arm/crypto/fipsmodule/aesv8-armx32.S",
+ "apple-arm/crypto/fipsmodule/armv4-mont.S",
+ "apple-arm/crypto/fipsmodule/bsaes-armv7.S",
+ "apple-arm/crypto/fipsmodule/ghash-armv4.S",
+ "apple-arm/crypto/fipsmodule/ghashv8-armx32.S",
+ "apple-arm/crypto/fipsmodule/sha1-armv4-large.S",
+ "apple-arm/crypto/fipsmodule/sha256-armv4.S",
+ "apple-arm/crypto/fipsmodule/sha512-armv4.S",
+ "apple-arm/crypto/fipsmodule/vpaes-armv7.S",
+ "apple-arm/crypto/test/trampoline-armv4.S",
+]
+
+crypto_sources_apple_x86 = [
+ "apple-x86/crypto/chacha/chacha-x86.S",
+ "apple-x86/crypto/fipsmodule/aesni-x86.S",
+ "apple-x86/crypto/fipsmodule/bn-586.S",
+ "apple-x86/crypto/fipsmodule/co-586.S",
+ "apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
+ "apple-x86/crypto/fipsmodule/ghash-x86.S",
+ "apple-x86/crypto/fipsmodule/md5-586.S",
+ "apple-x86/crypto/fipsmodule/sha1-586.S",
+ "apple-x86/crypto/fipsmodule/sha256-586.S",
+ "apple-x86/crypto/fipsmodule/sha512-586.S",
+ "apple-x86/crypto/fipsmodule/vpaes-x86.S",
+ "apple-x86/crypto/fipsmodule/x86-mont.S",
+ "apple-x86/crypto/test/trampoline-x86.S",
+]
+
+crypto_sources_apple_x86_64 = [
+ "apple-x86_64/crypto/chacha/chacha-x86_64.S",
+ "apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
+ "apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/aesni-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/ghash-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/md5-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
+ "apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
+ "apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/rsaz-avx2.S",
+ "apple-x86_64/crypto/fipsmodule/sha1-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/sha256-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/sha512-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
+ "apple-x86_64/crypto/fipsmodule/x86_64-mont.S",
+ "apple-x86_64/crypto/fipsmodule/x86_64-mont5.S",
+ "apple-x86_64/crypto/test/trampoline-x86_64.S",
]
crypto_sources_linux_aarch64 = [
@@ -621,44 +660,6 @@
"src/crypto/hrss/asm/poly_rq_mul.S",
]
-crypto_sources_mac_x86 = [
- "mac-x86/crypto/chacha/chacha-x86.S",
- "mac-x86/crypto/fipsmodule/aesni-x86.S",
- "mac-x86/crypto/fipsmodule/bn-586.S",
- "mac-x86/crypto/fipsmodule/co-586.S",
- "mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
- "mac-x86/crypto/fipsmodule/ghash-x86.S",
- "mac-x86/crypto/fipsmodule/md5-586.S",
- "mac-x86/crypto/fipsmodule/sha1-586.S",
- "mac-x86/crypto/fipsmodule/sha256-586.S",
- "mac-x86/crypto/fipsmodule/sha512-586.S",
- "mac-x86/crypto/fipsmodule/vpaes-x86.S",
- "mac-x86/crypto/fipsmodule/x86-mont.S",
- "mac-x86/crypto/test/trampoline-x86.S",
-]
-
-crypto_sources_mac_x86_64 = [
- "mac-x86_64/crypto/chacha/chacha-x86_64.S",
- "mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
- "mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
- "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/md5-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
- "mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
- "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S",
- "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/sha256-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/sha512-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
- "mac-x86_64/crypto/fipsmodule/x86_64-mont.S",
- "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S",
- "mac-x86_64/crypto/test/trampoline-x86_64.S",
-]
-
crypto_sources_win_aarch64 = [
"win-aarch64/crypto/chacha/chacha-armv8.S",
"win-aarch64/crypto/fipsmodule/aesv8-armx64.S",
diff --git a/BUILD.generated_tests.bzl b/BUILD.generated_tests.bzl
index 532ca40..51a5ea4 100644
--- a/BUILD.generated_tests.bzl
+++ b/BUILD.generated_tests.bzl
@@ -12,6 +12,7 @@
"src/crypto/cpu_arm_linux.h",
"src/crypto/curve25519/curve25519_tables.h",
"src/crypto/curve25519/internal.h",
+ "src/crypto/des/internal.h",
"src/crypto/dsa/internal.h",
"src/crypto/ec_extra/internal.h",
"src/crypto/err/internal.h",
@@ -21,7 +22,7 @@
"src/crypto/fipsmodule/bn/rsaz_exp.h",
"src/crypto/fipsmodule/cipher/internal.h",
"src/crypto/fipsmodule/delocate.h",
- "src/crypto/fipsmodule/des/internal.h",
+ "src/crypto/fipsmodule/dh/internal.h",
"src/crypto/fipsmodule/digest/internal.h",
"src/crypto/fipsmodule/digest/md32_common.h",
"src/crypto/fipsmodule/ec/internal.h",
diff --git a/android-sources.cmake b/android-sources.cmake
index 841eed9..15079b3 100644
--- a/android-sources.cmake
+++ b/android-sources.cmake
@@ -75,6 +75,7 @@
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesctrhmac.c
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesgcmsiv.c
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_chacha20poly1305.c
+ ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_des.c
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_null.c
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc2.c
${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc4.c
@@ -93,6 +94,7 @@
${BORINGSSL_ROOT}src/crypto/crypto.c
${BORINGSSL_ROOT}src/crypto/curve25519/curve25519.c
${BORINGSSL_ROOT}src/crypto/curve25519/spake25519.c
+ ${BORINGSSL_ROOT}src/crypto/des/des.c
${BORINGSSL_ROOT}src/crypto/dh_extra/dh_asn1.c
${BORINGSSL_ROOT}src/crypto/dh_extra/params.c
${BORINGSSL_ROOT}src/crypto/digest_extra/digest_extra.c
@@ -388,30 +390,66 @@
${BORINGSSL_ROOT}src/ssl/ssl_c_test.c
${BORINGSSL_ROOT}src/ssl/ssl_test.cc
)
-set(crypto_sources_ios_aarch64
- ${BORINGSSL_ROOT}ios-aarch64/crypto/chacha/chacha-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/aesv8-armx64.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/armv8-mont.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha1-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha256-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha512-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/vpaes-armv8.S
- ${BORINGSSL_ROOT}ios-aarch64/crypto/test/trampoline-armv8.S
+set(crypto_sources_apple_aarch64
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/chacha/chacha-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/armv8-mont.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha1-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha256-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha512-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
+ ${BORINGSSL_ROOT}apple-aarch64/crypto/test/trampoline-armv8.S
)
-set(crypto_sources_ios_arm
- ${BORINGSSL_ROOT}ios-arm/crypto/chacha/chacha-armv4.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/aesv8-armx32.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/armv4-mont.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/bsaes-armv7.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghash-armv4.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghashv8-armx32.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha1-armv4-large.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha256-armv4.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha512-armv4.S
- ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/vpaes-armv7.S
- ${BORINGSSL_ROOT}ios-arm/crypto/test/trampoline-armv4.S
+set(crypto_sources_apple_arm
+ ${BORINGSSL_ROOT}apple-arm/crypto/chacha/chacha-armv4.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/aesv8-armx32.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/armv4-mont.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/bsaes-armv7.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghash-armv4.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghashv8-armx32.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha1-armv4-large.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha256-armv4.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha512-armv4.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/vpaes-armv7.S
+ ${BORINGSSL_ROOT}apple-arm/crypto/test/trampoline-armv4.S
+)
+set(crypto_sources_apple_x86
+ ${BORINGSSL_ROOT}apple-x86/crypto/chacha/chacha-x86.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/aesni-x86.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/bn-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/co-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-x86.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/md5-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha1-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha256-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha512-586.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/vpaes-x86.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/x86-mont.S
+ ${BORINGSSL_ROOT}apple-x86/crypto/test/trampoline-x86.S
+)
+set(crypto_sources_apple_x86_64
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/chacha/chacha-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/md5-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
+ ${BORINGSSL_ROOT}apple-x86_64/crypto/test/trampoline-x86_64.S
)
set(crypto_sources_linux_aarch64
${BORINGSSL_ROOT}linux-aarch64/crypto/chacha/chacha-armv8.S
@@ -482,42 +520,6 @@
${BORINGSSL_ROOT}linux-x86_64/crypto/test/trampoline-x86_64.S
${BORINGSSL_ROOT}src/crypto/hrss/asm/poly_rq_mul.S
)
-set(crypto_sources_mac_x86
- ${BORINGSSL_ROOT}mac-x86/crypto/chacha/chacha-x86.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/aesni-x86.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/bn-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/co-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-x86.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/md5-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha1-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha256-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha512-586.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/vpaes-x86.S
- ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/x86-mont.S
- ${BORINGSSL_ROOT}mac-x86/crypto/test/trampoline-x86.S
-)
-set(crypto_sources_mac_x86_64
- ${BORINGSSL_ROOT}mac-x86_64/crypto/chacha/chacha-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/md5-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rsaz-avx2.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha1-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont5.S
- ${BORINGSSL_ROOT}mac-x86_64/crypto/test/trampoline-x86_64.S
-)
set(crypto_sources_win_aarch64
${BORINGSSL_ROOT}win-aarch64/crypto/chacha/chacha-armv8.S
${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/aesv8-armx64.S
diff --git a/apple-aarch64/crypto/chacha/chacha-armv8.S b/apple-aarch64/crypto/chacha/chacha-armv8.S
new file mode 100644
index 0000000..dd992a2
--- /dev/null
+++ b/apple-aarch64/crypto/chacha/chacha-armv8.S
@@ -0,0 +1,1992 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+
+.private_extern _OPENSSL_armcap_P
+
+.section __TEXT,__const
+
+.align 5
+Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+Lone:
+.long 1,0,0,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.text
+
+.globl _ChaCha20_ctr32
+.private_extern _ChaCha20_ctr32
+
+.align 5
+_ChaCha20_ctr32:
+ AARCH64_VALID_CALL_TARGET
+ cbz x2,Labort
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+ adrp x5,_OPENSSL_armcap_P@PAGE
+#endif
+ cmp x2,#192
+ b.lo Lshort
+ ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon
+
+Lshort:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __AARCH64EB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+Labort:
+ ret
+
+.align 4
+Ltail:
+ add x2,x2,#64
+Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+
+.align 5
+ChaCha20_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b Last_neon
+
+Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b Last_neon
+Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b Last_neon
+
+.align 4
+Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+ChaCha20_512_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,Lsigma@PAGE
+ add x5,x5,Lsigma@PAGEOFF
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __AARCH64EB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __AARCH64EB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b Loop_outer
+
+Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S b/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
new file mode 100644
index 0000000..50d7dea
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -0,0 +1,799 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.section __TEXT,__const
+.align 5
+Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl _aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+
+.align 5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x3,#-1
+ cmp x0,#0
+ b.eq Lenc_key_abort
+ cmp x2,#0
+ b.eq Lenc_key_abort
+ mov x3,#-2
+ cmp w1,#128
+ b.lt Lenc_key_abort
+ cmp w1,#256
+ b.gt Lenc_key_abort
+ tst w1,#0x3f
+ b.ne Lenc_key_abort
+
+ adrp x3,Lrcon@PAGE
+ add x3,x3,Lrcon@PAGEOFF
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt Loop128
+ b.eq L192
+ b L256
+
+.align 4
+Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b Ldone
+
+.align 4
+L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b Ldone
+
+.align 4
+L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b Loop256
+
+Ldone:
+ str w12,[x2]
+ mov x3,#0
+
+Lenc_key_abort:
+ mov x0,x3 // return value
+ ldr x29,[sp],#16
+ ret
+
+
+.globl _aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+
+.align 5
+_aes_hw_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl Lenc_key
+
+ cmp x0,#0
+ b.ne Ldec_key_abort
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+Ldec_key_abort:
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+
+.align 5
+_aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_enc:
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_enc
+
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl _aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+
+.align 5
+_aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+Loop_dec:
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt Loop_dec
+
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+
+.globl _aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+
+.align 5
+_aes_hw_cbc_encrypt:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq Lcbc_enc128
+
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b Lenter_cbc_enc
+
+.align 4
+Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+
+Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+
+.align 5
+Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b Lenter_cbc_enc128
+Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b Lcbc_done
+.align 5
+Lcbc_dec:
+ ld1 {v18.16b},[x0],#16
+ subs x2,x2,#32 // bias
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v19.16b,v18.16b,v18.16b
+ b.lo Lcbc_dec_tail
+
+ orr v1.16b,v18.16b,v18.16b
+ ld1 {v18.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ add x0,x0,x6 // x0 is adjusted in such way that
+ // at exit from the loop v1.16b-v18.16b
+ // are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v18.16b,v18.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v18.16b},[x1],#16
+ orr v18.16b,v19.16b,v19.16b
+ b.hs Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq Lcbc_done
+ nop
+
+Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ b.eq Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b Lcbc_done
+
+Lcbc_dec_one:
+ eor v5.16b,v5.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+
+Lcbc_done:
+ st1 {v6.16b},[x4]
+Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+
+.globl _aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+
+.align 5
+_aes_hw_ctr32_encrypt_blocks:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov lines
+ // could write to v1.16b and v18.16b directly, but that trips this bugs.
+ // We write to v6.16b and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __AARCH64EB__
+ rev w8, w8
+#endif
+ add w10, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v6.s[3],w10
+ add w8, w8, #2
+ orr v1.16b,v6.16b,v6.16b
+ b.ls Lctr32_tail
+ rev w12, w8
+ mov v6.s[3],w12
+ sub x2,x2,#3 // bias
+ orr v18.16b,v6.16b,v6.16b
+ b Loop3x_ctr32
+
+.align 4
+Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ add w9,w8,#1
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ rev w9,w9
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+ // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
+ eor v19.16b,v19.16b,v7.16b
+ mov v6.s[3], w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ orr v0.16b,v6.16b,v6.16b
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ mov v6.s[3], w10
+ rev w12,w8
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ orr v1.16b,v6.16b,v6.16b
+ mov v6.s[3], w12
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ orr v18.16b,v6.16b,v6.16b
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq Lctr32_done
+ st1 {v3.16b},[x1]
+
+Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/armv8-mont.S b/apple-aarch64/crypto/fipsmodule/armv8-mont.S
new file mode 100644
index 0000000..2493ae0
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -0,0 +1,1433 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl _bn_mul_mont
+.private_extern _bn_mul_mont
+
+.align 5
+_bn_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+ // (*) As for removal of first multiplication and addition
+ // instructions. The outcome of first addition is
+ // guaranteed to be zero, which leaves two computationally
+ // significant outcomes: it either carries or not. Then
+ // question is when does it carry? Is there alternative
+ // way to deduce it? If you follow operations, you can
+ // observe that condition for carry is quite simple:
+ // x6 being non-zero. So that carry can be calculated
+ // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,L1st_skip
+
+L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,L1st
+
+L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,Linner_skip
+
+Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,Linner
+
+Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+__bn_sqr8x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+ // only from bn_mul_mont which has already signed the return address.
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b Lsqr8x_zero_start
+
+Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+ adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_mul
+
+.align 4
+Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b Lsqr8x_outer_loop
+
+.align 4
+Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b Lsqr8x_tail
+
+.align 4
+Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b Lsqr8x_done
+
+.align 4
+Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+ // x19-7,x28 hold result, x6-7 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+ // x6-7 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.align 5
+// Montgomery multiplication over 64-bit limbs, processed four at a time.
+// In:  x0=rp (result), x1=ap, x2=bp, x3=np (modulus), x4=&n0 (Montgomery
+//      constant), x5=num (limb count).
+// Out: x0=1; the Montgomery product is written to rp[0..num-1], and the
+//      stack scratch area holding intermediate (secret) values is wiped.
+// NOTE(review): argument roles inferred from the loads below and the
+// bn_mul_mont convention — confirm against the generating Perl script.
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3	// x26 = sp - num*8
+	lsl	x5,x5,#3	// x5 = num*8 (byte length of one operand)
+	ldr	x4,[x4]	// *n0
+	sub	sp,x26,#8*4	// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5	// x27 = &a[num]
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]	// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4	// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0	// b[] byte offset, cycles 8,16,24,0 below
+	mov	x26,sp
+
+Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24	// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24	// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4	// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]	// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*) mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8	// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13	// can't overflow
+	mul	x13,x17,x25
+	// (*) adds	xzr,x19,x10
+	subs	xzr,x19,#1	// (*) carry := (x19!=0), same as carry of the
+				// omitted adds: x19+lo(n[0]*t[0]*n0) == 0 mod 2^64
+	umulh	x10,x14,x25	// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1	// remaining a[] bytes
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_reduction
+
+	cbz	x10,Lmul4x4_post_condition	// num==4: finish up directly
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]	// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+	mul	x10,x6,x24	// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24	// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]	// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25	// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13	// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25	// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]	// next t[0]*n0
+	str	x19,[x26],#8	// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1	// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewinded x1
+	cbz	x10,Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_1st_tail
+
+.align 5
+Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!	// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5	// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4	// clear carry bit
+	mov	x0,xzr
+
+.align 4
+Loop_mul4x_reduction:
+	mul	x10,x6,x24	// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24	// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4	// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]	// next b[i]
+	adds	x20,x20,x10
+	// (*) mul	x10,x14,x25
+	str	x25,[x26],#8	// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25	// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13	// can't overflow
+	mul	x13,x17,x25
+	// (*) adds	xzr,x19,x10
+	subs	xzr,x19,#1	// (*) same carry trick as in the first reduction
+	umulh	x10,x14,x25	// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]	// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align 4
+Loop_mul4x_tail:
+	mul	x10,x6,x24	// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24	// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]	// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25	// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13	// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25	// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]	// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8	// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1	// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_tail
+
+	sub	x11,x3,x5	// rewinded np?
+	adc	x0,x0,xzr
+	cbz	x10,Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_tail
+
+.align 4
+Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4	// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5	// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr	// x30 = top-most carry
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13	// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4	// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	Loop_mul4x_reduction
+
+.align 4
+Lmul4x_post:
+	// Final step. We see if result is larger than modulus, and
+	// if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract modulus, see if it borrowed,
+	// and conditionally copy original value.
+	mov	x0,x12
+	mov	x27,x12	// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]	// pull return address
+
+	sub	x28,x5,#8*4
+Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo	// lo (borrow) => keep pre-subtraction value
+	stp	xzr,xzr,[x26,#8*0]	// wipe t[] (secret data) as we go
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]	// NOTE(review): #8*3/#8*4 stores overlap the
+	csel	x13,x22,x9,lo	// ones above; presumably a deliberate over-wipe —
+	stp	xzr,xzr,[x26,#8*4]	// matches upstream, confirm against Perl source
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	Lmul4x_done
+
+.align 4
+Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]	// pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]	// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]	// wipe stack scratch (secret data)
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1	// return value
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
new file mode 100644
index 0000000..5441afc
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -0,0 +1,343 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_gcm_init_neon
+.private_extern	_gcm_init_neon
+
+.align	4
+// gcm_init_neon: derive the "twisted" hash key used by the NEON GHASH
+// routines below and store it at Htable[0] (a single entry, unlike the
+// multi-entry table built by gcm_init_v8).
+// In:  x0 = Htable (output), x1 = H (raw hash key, 16 bytes).
+// NOTE(review): C-level signature inferred from the loads/stores below —
+// confirm against the generating Perl script.
+_gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]	// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57	// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31	// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]	// store Htable[0]
+	ret
+
+
+.globl	_gcm_gmult_neon
+.private_extern	_gcm_gmult_neon
+
+.align	4
+// gcm_gmult_neon: multiply the 16-byte accumulator Xi (at x0) by the
+// twisted hash key stored at x1 and write the product back to Xi — i.e.
+// one GHASH block with no new input.  Sets x3=16 so the shared Lgmult_neon
+// code (inside _gcm_ghash_neon below) executes exactly one iteration.
+_gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]	// load Xi
+	ld1	{v5.1d}, [x1], #8	// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE	// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48,k32}, v25 = {k16,k0}
+	rev64	v3.16b, v3.16b	// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	Lgmult_neon	// tail-branch into the shared multiply loop
+
+
+.globl	_gcm_ghash_neon
+.private_extern	_gcm_ghash_neon
+
+.align	4
+// gcm_ghash_neon: fold len bytes of input into the GHASH accumulator Xi.
+// In:  x0 = Xi (16 bytes, updated in place), x1 = twisted H (from
+//      gcm_init_neon), x2 = input, x3 = len (decremented by 16 per loop,
+//      so presumably a multiple of 16 — confirm at callers).
+// Uses only 8-bit polynomial multiplies (pmull on .8b lanes) with
+// Karatsuba splitting, i.e. it does not require the PMULL64 crypto
+// extension used by the ghashv8 code.
+_gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]	// load Xi
+	ld1	{v5.1d}, [x1], #8	// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE	// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]	// v24 = {k48,k32}, v25 = {k16,k0}
+	rev64	v0.16b, v0.16b	// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b	// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+Lgmult_neon:
+	// Shared entry point: _gcm_gmult_neon branches here with x3 == 16.
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b	// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1	// B1
+	pmull	v0.8h, v5.8b, v0.8b	// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b	// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b	// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b	// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3	// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b	// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	// vand	$t0#hi, $t0#hi, $k48
+	// veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	// vand	$t1#hi, $t1#hi, $k32
+	// veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	// vand	$t2#hi, $t2#hi, $k16
+	// veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	// vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b	// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b	// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b	// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1	// B1
+	pmull	v1.8h, v7.8b, v1.8b	// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b	// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b	// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b	// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3	// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b	// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	// vand	$t0#hi, $t0#hi, $k48
+	// veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	// vand	$t1#hi, $t1#hi, $k32
+	// veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	// vand	$t2#hi, $t2#hi, $k16
+	// veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	// vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b	// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b	// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b	// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1	// B1
+	pmull	v2.8h, v6.8b, v2.8b	// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b	// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b	// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b	// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3	// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b	// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	// vand	$t0#hi, $t0#hi, $k48
+	// veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	// vand	$t1#hi, $t1#hi, $k32
+	// veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	// vand	$t2#hi, $t2#hi, $k16
+	// veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	// vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b	// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b	// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]	// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57	// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]	// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]	// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1	// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1	//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	Loop_neon
+
+	rev64	v0.16b, v0.16b	// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+
+
+.section __TEXT,__const
+.align 4
+Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
new file mode 100644
index 0000000..0ba0cdd
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -0,0 +1,573 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.globl _gcm_init_v8
+.private_extern _gcm_init_v8
+
+.align 4
+// gcm_init_v8: build the GHASH key table from the hash key H.
+//   In:  x0 = Htable output pointer, x1 = 128-bit hash key H.
+//   Out: six 16-byte entries stored at x0: "twisted" H (H<<1 with the
+//        0xc2... reduction constant folded in), the packed Karatsuba
+//        pre-processed halves for (H,H^2), H^2, then H^3, the pack for
+//        (H^3,H^4), and H^4.
+//   Clobbers v0-v7, v16-v22; no stack use.
+// NOTE(review): machine-generated from ghashv8-armx.pl — fix the Perl
+// source rather than editing this file by hand.
+_gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b	//twisted H
+	st1	{v20.2d},[x0],#16	//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8	//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]	//store Htable[3..5]
+	ret
+
+.globl _gcm_gmult_v8
+.private_extern _gcm_gmult_v8
+
+.align 4
+// gcm_gmult_v8: Xi = Xi * H in GF(2^128) — single-block GHASH multiply.
+//   In:  x0 = Xi (16 bytes, read and written in place),
+//        x1 = Htable as produced by gcm_init_v8 (twisted H + Karatsuba pack).
+//   On little-endian, Xi is byte-reversed (rev64) around the multiply.
+//   Clobbers v0-v3, v16-v21; no stack use.
+// NOTE(review): machine-generated from ghashv8-armx.pl — do not hand-edit.
+_gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.globl _gcm_ghash_v8
+.private_extern _gcm_ghash_v8
+
+.align 4
+// gcm_ghash_v8: fold len bytes of input into the GHASH accumulator Xi.
+//   In:  x0 = Xi (16 bytes, read and written), x1 = Htable (from
+//        gcm_init_v8), x2 = input pointer, x3 = length in bytes
+//        (assumed a multiple of 16 — TODO confirm against C caller).
+//   Lengths >= 64 are dispatched to the 4-block path (gcm_ghash_v8_4x);
+//   otherwise a modulo-scheduled two-blocks-per-iteration loop runs, with
+//   Lodd_tail_v8 handling a final odd block.
+// NOTE(review): machine-generated from ghashv8-armx.pl — do not hand-edit.
+_gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+					//"[rotated]" means that
+					//loaded value would have
+					//to be rotated in order to
+					//make it appear as in
+					//algorithm specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as post-
+					//increment for input pointer;
+					//as loop is modulo-scheduled
+					//x12 is zeroed just in time
+					//to preclude overstepping
+					//inp[len], which means that
+					//last block[s] are actually
+					//loaded twice, but last
+					//copy is not processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	Loop_mod2x_v8
+
+.align	4
+Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	Ldone_v8		//is x3 zero?
+Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+
+.align	4
+// gcm_ghash_v8_4x: four-blocks-per-iteration GHASH using powers H..H^4.
+//   Same register contract as gcm_ghash_v8 (x0 = Xi, x1 = Htable,
+//   x2 = input, x3 = length in bytes); entered when len >= 64.
+//   Each Loop4x iteration multiplies the accumulated Xi by H^4 and the
+//   next three blocks by H^3..H (see the per-pmull comments), interleaving
+//   the reduction with pre-multiplication of the following four blocks.
+//   The tail (Lthree/Ltwo/Lone) handles a remainder of 1-3 blocks before
+//   the shared final reduction at Ldone4x.
+// NOTE(review): machine-generated from ghashv8-armx.pl — do not hand-edit.
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	Ltail4x
+
+	b	Loop4x
+
+.align	4
+Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	Loop4x
+
+Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	Ldone4x
+
+	cmp	x3,#32
+	b.lo	Lone
+	b.eq	Ltwo
+Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d	//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d	//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef __AARCH64EB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/sha1-armv8.S b/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
new file mode 100644
index 0000000..62ba800
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -0,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+
+.private_extern _OPENSSL_armcap_P
+.globl _sha1_block_data_order
+.private_extern _sha1_block_data_order
+
+.align 6
+_sha1_block_data_order:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+ adrp x16,_OPENSSL_armcap_P@PAGE
+#endif
+ ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+ tst w16,#ARMV8_SHA1
+ b.ne Lv8_entry
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp w20,w21,[x0]
+ ldp w22,w23,[x0,#8]
+ ldr w24,[x0,#16]
+
+Loop:
+ ldr x3,[x1],#64
+ movz w28,#0x7999
+ sub x2,x2,#1
+ movk w28,#0x5a82,lsl#16
+#ifdef __AARCH64EB__
+ ror x3,x3,#32
+#else
+ rev32 x3,x3
+#endif
+ add w24,w24,w28 // warm it up
+ add w24,w24,w3
+ lsr x4,x3,#32
+ ldr x5,[x1,#-56]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w4 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x5,x5,#32
+#else
+ rev32 x5,x5
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w5 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x6,x5,#32
+ ldr x7,[x1,#-48]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w6 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x7,x7,#32
+#else
+ rev32 x7,x7
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w7 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x8,x7,#32
+ ldr x9,[x1,#-40]
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w8 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x9,x9,#32
+#else
+ rev32 x9,x9
+#endif
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w9 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ lsr x10,x9,#32
+ ldr x11,[x1,#-32]
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w10 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x11,x11,#32
+#else
+ rev32 x11,x11
+#endif
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w11 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ lsr x12,x11,#32
+ ldr x13,[x1,#-24]
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w12 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x13,x13,#32
+#else
+ rev32 x13,x13
+#endif
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w13 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ lsr x14,x13,#32
+ ldr x15,[x1,#-16]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w14 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x15,x15,#32
+#else
+ rev32 x15,x15
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w15 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x16,x15,#32
+ ldr x17,[x1,#-8]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w16 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __AARCH64EB__
+ ror x17,x17,#32
+#else
+ rev32 x17,x17
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w17 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x19,x17,#32
+ eor w3,w3,w5
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w3,w3,w11
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w3,w3,w16
+ ror w22,w22,#2
+ add w24,w24,w19 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ eor w4,w4,w12
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ eor w4,w4,w17
+ ror w21,w21,#2
+ add w23,w23,w3 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ eor w5,w5,w13
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ eor w5,w5,w19
+ ror w20,w20,#2
+ add w22,w22,w4 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ eor w6,w6,w14
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ eor w6,w6,w3
+ ror w24,w24,#2
+ add w21,w21,w5 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ eor w7,w7,w15
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ eor w7,w7,w4
+ ror w23,w23,#2
+ add w20,w20,w6 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ movz w28,#0xeba1
+ movk w28,#0x6ed9,lsl#16
+ eor w8,w8,w10
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w8,w8,w16
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w8,w8,w5
+ ror w22,w22,#2
+ add w24,w24,w7 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w9,w9,w6
+ add w23,w23,w8 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w10,w10,w7
+ add w22,w22,w9 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w11,w11,w8
+ add w21,w21,w10 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w12,w12,w9
+ add w20,w20,w11 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w13,w13,w10
+ add w24,w24,w12 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w14,w14,w11
+ add w23,w23,w13 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w15,w15,w12
+ add w22,w22,w14 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w16,w16,w13
+ add w21,w21,w15 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w17,w17,w14
+ add w20,w20,w16 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w19,w19,w15
+ add w24,w24,w17 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w3,w3,w16
+ add w23,w23,w19 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w4,w4,w17
+ add w22,w22,w3 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w5,w5,w19
+ add w21,w21,w4 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w6,w6,w3
+ add w20,w20,w5 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w7,w7,w4
+ add w24,w24,w6 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w8,w8,w5
+ add w23,w23,w7 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w9,w9,w6
+ add w22,w22,w8 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w10,w10,w7
+ add w21,w21,w9 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w11,w11,w8
+ add w20,w20,w10 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ movz w28,#0xbcdc
+ movk w28,#0x8f1b,lsl#16
+ eor w12,w12,w14
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w12,w12,w9
+ add w24,w24,w11 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w13,w13,w15
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w13,w13,w5
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w13,w13,w10
+ add w23,w23,w12 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w14,w14,w16
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w14,w14,w6
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w14,w14,w11
+ add w22,w22,w13 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w15,w15,w17
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w15,w15,w7
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w15,w15,w12
+ add w21,w21,w14 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w16,w16,w19
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w16,w16,w8
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w16,w16,w13
+ add w20,w20,w15 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w17,w17,w3
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w17,w17,w9
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w17,w17,w14
+ add w24,w24,w16 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w19,w19,w4
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w19,w19,w10
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w19,w19,w15
+ add w23,w23,w17 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w3,w3,w5
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w3,w3,w11
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w3,w3,w16
+ add w22,w22,w19 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w4,w4,w6
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w4,w4,w12
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w4,w4,w17
+ add w21,w21,w3 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w5,w5,w7
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w5,w5,w13
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w5,w5,w19
+ add w20,w20,w4 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w6,w6,w8
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w6,w6,w14
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w6,w6,w3
+ add w24,w24,w5 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w7,w7,w9
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w7,w7,w15
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w7,w7,w4
+ add w23,w23,w6 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w8,w8,w10
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w8,w8,w16
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w8,w8,w5
+ add w22,w22,w7 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w9,w9,w11
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w9,w9,w17
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w9,w9,w6
+ add w21,w21,w8 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w10,w10,w12
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w10,w10,w19
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w10,w10,w7
+ add w20,w20,w9 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w11,w11,w13
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w11,w11,w3
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w11,w11,w8
+ add w24,w24,w10 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w12,w12,w14
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w12,w12,w4
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w12,w12,w9
+ add w23,w23,w11 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w13,w13,w15
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w13,w13,w5
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w13,w13,w10
+ add w22,w22,w12 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w14,w14,w16
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w14,w14,w6
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w14,w14,w11
+ add w21,w21,w13 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w15,w15,w17
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w15,w15,w7
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w15,w15,w12
+ add w20,w20,w14 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ movz w28,#0xc1d6
+ movk w28,#0xca62,lsl#16
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w16,w16,w19
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w16,w16,w8
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w16,w16,w13
+ add w24,w24,w15 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w17,w17,w14
+ add w23,w23,w16 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w19,w19,w15
+ add w22,w22,w17 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w3,w3,w16
+ add w21,w21,w19 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w4,w4,w17
+ add w20,w20,w3 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w5,w5,w19
+ add w24,w24,w4 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w6,w6,w3
+ add w23,w23,w5 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w7,w7,w4
+ add w22,w22,w6 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w8,w8,w5
+ add w21,w21,w7 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w9,w9,w6
+ add w20,w20,w8 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w10,w10,w7
+ add w24,w24,w9 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w11,w11,w8
+ add w23,w23,w10 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w12,w12,w9
+ add w22,w22,w11 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w13,w13,w10
+ add w21,w21,w12 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w14,w14,w11
+ add w20,w20,w13 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w15,w15,w12
+ add w24,w24,w14 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w16,w16,w13
+ add w23,w23,w15 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w17,w17,w14
+ add w22,w22,w16 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w19,w19,w15
+ add w21,w21,w17 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ ldp w4,w5,[x0]
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w19 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ldp w6,w7,[x0,#8]
+ eor w25,w24,w22
+ ror w27,w21,#27
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ ldr w8,[x0,#16]
+ add w20,w20,w25 // e+=F(b,c,d)
+ add w21,w21,w5
+ add w22,w22,w6
+ add w20,w20,w4
+ add w23,w23,w7
+ add w24,w24,w8
+ stp w20,w21,[x0]
+ stp w22,w23,[x0,#8]
+ str w24,[x0,#16]
+ cbnz x2,Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+
+
+.align 6
+sha1_block_armv8:  // SHA-1 via Armv8 Crypto Extensions; x0=5-word state, x1=input blocks, x2=block count
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+Lv8_entry:  // NOTE(review): presumably branched to from the generic entry when SHA-1 insns are available — dispatch not visible here
+ stp x29,x30,[sp,#-16]!  // build frame record
+ add x29,sp,#0  // x29 = frame pointer
+
+ adrp x4,Lconst@PAGE  // x4 = page of round-constant table Lconst
+ add x4,x4,Lconst@PAGEOFF
+ eor v1.16b,v1.16b,v1.16b  // zero v1; lane 0 will carry state word e
+ ld1 {v0.4s},[x0],#16  // load state words 0..3 (a,b,c,d)
+ ld1 {v1.s}[0],[x0]  // load state word 4 (e)
+ sub x0,x0,#16  // rewind x0 to start of state
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]  // K_00_19..K_60_79, each splatted across 4 lanes
+
+Loop_hw:  // one iteration per 64-byte message block
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64  // load message block, advance input pointer
+ sub x2,x2,#1  // x2 = blocks remaining
+ rev32 v4.16b,v4.16b  // byte-swap 32-bit words (message is big-endian)
+ rev32 v5.16b,v5.16b
+
+ add v20.4s,v16.4s,v4.4s  // v20 = W[0..3]+K
+ rev32 v6.16b,v6.16b
+ orr v22.16b,v0.16b,v0.16b // offload
+
+ add v21.4s,v16.4s,v5.4s  // v21 = W[4..7]+K
+ rev32 v7.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b
+.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
+ add v20.4s,v16.4s,v6.4s
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 1
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v16.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 2
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v16.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 3
+.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 4
+.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 5
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 6
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 7
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 8
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 9
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 10
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 11
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 12
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 13
+.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 14
+.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v4.4s
+.long 0x5e281885 //sha1su1 v5.16b,v4.16b
+.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 15
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v5.4s
+.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 16
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v6.4s
+.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 17
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+
+.long 0x5e280803 //sha1h v3.16b,v0.16b // 18
+.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+
+.long 0x5e280802 //sha1h v2.16b,v0.16b // 19
+.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+
+ add v1.4s,v1.4s,v2.4s  // e += final sha1h output
+ add v0.4s,v0.4s,v22.4s  // a..d += input state saved in v22
+
+ cbnz x2,Loop_hw  // loop until all blocks consumed
+
+ st1 {v0.4s},[x0],#16  // store updated a..d
+ st1 {v1.s}[0],[x0]  // store updated e
+
+ ldr x29,[sp],#16  // pop frame; x30 deliberately not reloaded (see PAuth note at entry)
+ ret
+
+.section __TEXT,__const  // read-only data (Mach-O)
+.align 6  // 64-byte (cache-line) alignment for the constant table
+Lconst:  // SHA-1 round constants, one per 20-round group, splatted across 4 lanes
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0  // NUL-terminated ident: "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/sha256-armv8.S b/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
new file mode 100644
index 0000000..b40b260
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -0,0 +1,1212 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.private_extern _OPENSSL_armcap_P
+.globl _sha256_block_data_order
+.private_extern _sha256_block_data_order
+
+.align 6
+_sha256_block_data_order:
+ AARCH64_VALID_CALL_TARGET
+#ifndef __KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+ adrp x16,_OPENSSL_armcap_P@PAGE
+#endif
+ ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+ tst w16,#ARMV8_SHA256
+ b.ne Lv8_entry
+#endif
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adrp x30,LK256@PAGE
+ add x30,x30,LK256@PAGEOFF
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section __TEXT,__const // Mach-O read-only data segment/section
+.align 6 // 64-byte alignment for the round-constant table
+
+LK256: // SHA-256 round constants K[0..63] (FIPS 180-4), 4 per line, + 0 terminator
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ID string: "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
+.align 2
+.align 2 // duplicate .align emitted by the generator script; harmless
+.text
+#ifndef __KERNEL__
+
+.align 6
+sha256_block_armv8: // SHA-256 via Armv8 crypto extensions; presumably x0=state, x1=input, x2=#blocks (matches the sha512 entry's register use) — TODO confirm against C prototype
+Lv8_entry:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0] // load hash state (stored back to [x0] at the end)
+ adrp x3,LK256@PAGE // x3 = page of LK256 round-constant table
+ add x3,x3,LK256@PAGEOFF // x3 = &LK256
+
+Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 // load one 64-byte input block, advance x1
+ sub x2,x2,#1 // one fewer block to process
+ ld1 {v16.4s},[x3],#16 // load next 4 round constants
+ rev32 v4.16b,v4.16b // byte-swap input to 32-bit big-endian words
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b // (v18/v19 keep entry state for the final accumulate)
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.long 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16 // final rounds: no more sha256su0/su1 schedule updates below
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3] // last K quadword; no post-increment before the rewind
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s // state += compressed block (v18/v19 offloaded above)
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,Loop_hw // more blocks remaining?
+
+ st1 {v0.4s,v1.4s},[x0] // store updated hash state
+
+ ldr x29,[sp],#16 // pop frame; x30 deliberately not reloaded (PAuth note at entry)
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/sha512-armv8.S b/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
new file mode 100644
index 0000000..b2d366d
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -0,0 +1,1614 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+// Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.private_extern _OPENSSL_armcap_P
+.globl _sha512_block_data_order
+.private_extern _sha512_block_data_order
+
+.align 6
+_sha512_block_data_order:
+ AARCH64_VALID_CALL_TARGET
+#ifndef __KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+ adrp x16,_OPENSSL_armcap_P@PAGE
+#endif
+ ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+ tst w16,#ARMV8_SHA512
+ b.ne Lv8_entry
+#endif
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adrp x30,LK512@PAGE
+ add x30,x30,LK512@PAGEOFF
+ stp x0,x2,[x29,#96]
+
+Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __AARCH64EB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __AARCH64EB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.section __TEXT,__const
+.align 6
+
+LK512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0 // terminator
+
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+
+.align 6
+sha512_block_armv8:
+Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context
+ adrp x3,LK512@PAGE
+ add x3,x3,LK512@PAGEOFF
+
+ rev64 v16.16b,v16.16b
+ rev64 v17.16b,v17.16b
+ rev64 v18.16b,v18.16b
+ rev64 v19.16b,v19.16b
+ rev64 v20.16b,v20.16b
+ rev64 v21.16b,v21.16b
+ rev64 v22.16b,v22.16b
+ rev64 v23.16b,v23.16b
+ b Loop_hw
+
+.align 4
+Loop_hw:
+ ld1 {v24.2d},[x3],#16
+ subs x2,x2,#1
+ sub x4,x1,#128
+ orr v26.16b,v0.16b,v0.16b // offload
+ orr v27.16b,v1.16b,v1.16b
+ orr v28.16b,v2.16b,v2.16b
+ orr v29.16b,v3.16b,v3.16b
+ csel x1,x1,x4,ne // conditional rewind
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08230 //sha512su0 v16.16b,v17.16b
+ ext v7.16b,v20.16b,v21.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08251 //sha512su0 v17.16b,v18.16b
+ ext v7.16b,v21.16b,v22.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec08272 //sha512su0 v18.16b,v19.16b
+ ext v7.16b,v22.16b,v23.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08293 //sha512su0 v19.16b,v20.16b
+ ext v7.16b,v23.16b,v16.16b,#8
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082b4 //sha512su0 v20.16b,v21.16b
+ ext v7.16b,v16.16b,v17.16b,#8
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec082d5 //sha512su0 v21.16b,v22.16b
+ ext v7.16b,v17.16b,v18.16b,#8
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v25.2d},[x3],#16
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xcec082f6 //sha512su0 v22.16b,v23.16b
+ ext v7.16b,v18.16b,v19.16b,#8
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v24.2d},[x3],#16
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xcec08217 //sha512su0 v23.16b,v16.16b
+ ext v7.16b,v19.16b,v20.16b,#8
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v16.2d
+ ld1 {v16.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v16.16b,v16.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v17.2d
+ ld1 {v17.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v17.16b,v17.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v18.2d
+ ld1 {v18.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v18.16b,v18.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v19.2d
+ ld1 {v19.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v2.16b,v3.16b,#8
+ ext v6.16b,v1.16b,v2.16b,#8
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b
+ rev64 v19.16b,v19.16b
+ add v4.2d,v1.2d,v3.2d // "D + T1"
+.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v20.2d
+ ld1 {v20.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v4.16b,v2.16b,#8
+ ext v6.16b,v0.16b,v4.16b,#8
+ add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b
+ rev64 v20.16b,v20.16b
+ add v1.2d,v0.2d,v2.2d // "D + T1"
+.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b
+ ld1 {v24.2d},[x3],#16
+ add v25.2d,v25.2d,v21.2d
+ ld1 {v21.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v1.16b,v4.16b,#8
+ ext v6.16b,v3.16b,v1.16b,#8
+ add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b
+ rev64 v21.16b,v21.16b
+ add v0.2d,v3.2d,v4.2d // "D + T1"
+.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b
+ ld1 {v25.2d},[x3],#16
+ add v24.2d,v24.2d,v22.2d
+ ld1 {v22.16b},[x1],#16 // load next input
+ ext v24.16b,v24.16b,v24.16b,#8
+ ext v5.16b,v0.16b,v1.16b,#8
+ ext v6.16b,v2.16b,v0.16b,#8
+ add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]"
+.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b
+ rev64 v22.16b,v22.16b
+ add v3.2d,v2.2d,v1.2d // "D + T1"
+.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b
+ sub x3,x3,#80*8 // rewind
+ add v25.2d,v25.2d,v23.2d
+ ld1 {v23.16b},[x1],#16 // load next input
+ ext v25.16b,v25.16b,v25.16b,#8
+ ext v5.16b,v3.16b,v0.16b,#8
+ ext v6.16b,v4.16b,v3.16b,#8
+ add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]"
+.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b
+ rev64 v23.16b,v23.16b
+ add v2.2d,v4.2d,v0.2d // "D + T1"
+.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b
+ add v0.2d,v0.2d,v26.2d // accumulate
+ add v1.2d,v1.2d,v27.2d
+ add v2.2d,v2.2d,v28.2d
+ add v3.2d,v3.2d,v29.2d
+
+ cbnz x2,Loop_hw
+
+ st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context
+
+ ldr x29,[sp],#16
+ ret
+
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S b/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
new file mode 100644
index 0000000..6dfc25d
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
@@ -0,0 +1,1232 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.section __TEXT,__const
+
+
+.align 7 // totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward: // mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward: // mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr: // sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv: // inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt: // input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo: // sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1: // sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2: // sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+Lk_dipt: // decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo: // decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9: // decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd: // decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb: // decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe: // decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+Lk_dksd: // decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb: // decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9: // decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon: // rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt: // output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew: // deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+
+.align 6
+
+.text
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+
+.align 4
+_vpaes_encrypt_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v17.16b, #0x0f
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2
+ ret
+
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+
+.align 4
+_vpaes_encrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward@PAGE+16
+ add x11, x11, Lk_mc_forward@PAGEOFF+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+
+
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+
+.align 4
+_vpaes_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, Lk_mc_forward@PAGE+16
+ add x11, x11, Lk_mc_forward@PAGEOFF+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {v20.16b}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {v21.16b}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Lenc_2x_entry
+
+.align 4
+Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {v25.16b}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v24.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {v27.16b}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {v26.16b}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v23.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+
+
+
+.align 4
+_vpaes_decrypt_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v17.16b, #0x0f
+ adrp x11, Lk_dipt@PAGE
+ add x11, x11, Lk_dipt@PAGEOFF
+ ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe
+ ret
+
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+
+.align 4
+_vpaes_decrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr@PAGE
+ add x10, x10, Lk_sr@PAGEOFF
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward@PAGE+48
+ add x10, x10, Lk_mc_forward@PAGEOFF+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+
+
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+
+.align 4
+_vpaes_decrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+// v14-v15 input, v0-v1 output
+
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, Lk_sr@PAGE
+ add x10, x10, Lk_sr@PAGEOFF
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, Lk_mc_forward@PAGE+48
+ add x10, x10, Lk_mc_forward@PAGEOFF+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {v20.16b},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {v21.16b},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b Ldec_2x_entry
+
+.align 4
+Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {v24.16b}, v10.16b
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {v25.16b}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {v26.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {v27.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {v28.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {v29.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {v30.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {v31.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {v23.16b}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+
+.align 4
+_vpaes_key_preheat:
+ adrp x10, Lk_inv@PAGE
+ add x10, x10, Lk_inv@PAGEOFF
+ movi v16.16b, #0x5b // Lk_s63
+ adrp x11, Lk_sb1@PAGE
+ add x11, x11, Lk_sb1@PAGEOFF
+ movi v17.16b, #0x0f // Lk_s0F
+ ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt
+ adrp x10, Lk_dksd@PAGE
+ add x10, x10, Lk_dksd@PAGEOFF
+ ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1
+ adrp x11, Lk_mc_forward@PAGE
+ add x11, x11, Lk_mc_forward@PAGEOFF
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9
+ ld1 {v8.2d}, [x10] // Lk_rcon
+ ld1 {v9.2d}, [x11] // Lk_mc_forward[0]
+ ret
+
+
+
+.align 4
+_vpaes_schedule_core:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10
+ add x10, x10, Lk_sr@PAGEOFF
+
+ add x8, x8, x10
+ cbnz w3, Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi Lschedule_256
+ b.eq Lschedule_192
+ // 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+Lschedule_256:
+ ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov x0, #7 // mov $7, %esi
+
+Loop_schedule_256:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz x0, Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew
+ add x11, x11, Lk_deskew@PAGEOFF
+
+ cbnz w3, Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform
+ add x11, x11, Lk_opt@PAGEOFF
+ add x2, x2, #32 // add $32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+Lschedule_mangle_last_dec:
+ ld1 {v20.2d,v21.2d}, [x11] // reload constants
+ sub x2, x2, #16 // add $-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
+ ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz w3, Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
+ add x2, x2, #16 // add $16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b Lschedule_mangle_both
+.align 4
+Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub x2, x2, #16 // add $-16, %rdx
+
+Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #48 // add $-16, %r8
+ and x8, x8, #~(1<<6) // and $0x30, %r8
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ ret
+
+
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+
+.align 4
+_vpaes_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov w3, #0 // mov $0,%ecx
+ mov x8, #0x30 // mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+
+.align 4
+_vpaes_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl $4,%eax
+ add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
+ add x2, x2, x9
+
+ mov w3, #1 // mov $1,%ecx
+ lsr w8, w1, #1 // shr $1,%r8d
+ and x8, x8, #32 // and $32,%r8d
+ eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _vpaes_cbc_encrypt
+.private_extern _vpaes_cbc_encrypt
+
+.align 4
+_vpaes_cbc_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2, Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+
+ ld1 {v0.16b}, [x4] // load ivec
+ bl _vpaes_encrypt_preheat
+ b Lcbc_enc_loop
+
+.align 4
+Lcbc_enc_loop:
+ ld1 {v7.16b}, [x0],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16 // save output
+ subs x17, x17, #16
+ b.hi Lcbc_enc_loop
+
+ st1 {v0.16b}, [x4] // write ivec
+
+ ldp x29,x30,[sp],#16
+Lcbc_abort:
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+
+.align 4
+vpaes_cbc_decrypt:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+ ld1 {v6.16b}, [x4] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [x0], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #16
+ b.ls Lcbc_dec_done
+
+.align 4
+Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+ st1 {v6.16b}, [x4]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.globl _vpaes_ctr32_encrypt_blocks
+.private_extern _vpaes_ctr32_encrypt_blocks
+
+.align 4
+_vpaes_ctr32_encrypt_blocks:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz x2, Lctr32_done
+
+ // Note, unlike the other functions, x2 here is measured in blocks,
+ // not bytes.
+ mov x17, x2
+ mov x2, x3
+
+ // Load the IV and counter portion.
+ ldr w6, [x4, #12]
+ ld1 {v7.16b}, [x4]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev w6, w6 // The counter is big-endian.
+ b.eq Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [x0], #16 // Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add w6, w6, #1
+ rev w7, w6
+ mov v7.s[3], w7
+ b.ls Lctr32_done
+
+Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add w6, w6, #1
+ rev w7, w6
+ mov v15.s[3], w7
+
+Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add w7, w6, #1
+ add w6, w6, #2
+ rev w7, w7
+ mov v14.s[3], w7
+ rev w7, w6
+ mov v15.s[3], w7
+ b.hi Lctr32_loop
+
+Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/test/trampoline-armv8.S b/apple-aarch64/crypto/test/trampoline-armv8.S
new file mode 100644
index 0000000..325da9b
--- /dev/null
+++ b/apple-aarch64/crypto/test/trampoline-armv8.S
@@ -0,0 +1,758 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+// const uint64_t *argv, size_t argc,
+// uint64_t unwind);
+
+.globl _abi_test_trampoline
+.private_extern _abi_test_trampoline
+.align 4
+_abi_test_trampoline:
+Labi_test_trampoline_begin:
+ AARCH64_SIGN_LINK_REGISTER
+ // Stack layout (low to high addresses)
+ // x29,x30 (16 bytes)
+ // d8-d15 (64 bytes)
+ // x19-x28 (80 bytes)
+ // x1 (8 bytes)
+ // padding (8 bytes)
+ stp x29, x30, [sp, #-176]!
+ mov x29, sp
+
+ // Saved callee-saved registers and |state|.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ stp x23, x24, [sp, #112]
+ stp x25, x26, [sp, #128]
+ stp x27, x28, [sp, #144]
+ str x1, [sp, #160]
+
+ // Load registers from |state|, with the exception of x29. x29 is the
+ // frame pointer and also callee-saved, but AAPCS64 allows platforms to
+ // mandate that x29 always point to a frame. iOS64 does so, which means
+ // we cannot fill x29 with entropy without violating ABI rules
+ // ourselves. x29 is tested separately below.
+ ldp d8, d9, [x1], #16
+ ldp d10, d11, [x1], #16
+ ldp d12, d13, [x1], #16
+ ldp d14, d15, [x1], #16
+ ldp x19, x20, [x1], #16
+ ldp x21, x22, [x1], #16
+ ldp x23, x24, [x1], #16
+ ldp x25, x26, [x1], #16
+ ldp x27, x28, [x1], #16
+
+ // Move parameters into temporary registers.
+ mov x9, x0
+ mov x10, x2
+ mov x11, x3
+
+ // Load parameters into registers.
+ cbz x11, Largs_done
+ ldr x0, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x1, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x2, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x3, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x4, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x5, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x6, [x10], #8
+ subs x11, x11, #1
+ b.eq Largs_done
+ ldr x7, [x10], #8
+
+Largs_done:
+ blr x9
+
+ // Reload |state| and store registers.
+ ldr x1, [sp, #160]
+ stp d8, d9, [x1], #16
+ stp d10, d11, [x1], #16
+ stp d12, d13, [x1], #16
+ stp d14, d15, [x1], #16
+ stp x19, x20, [x1], #16
+ stp x21, x22, [x1], #16
+ stp x23, x24, [x1], #16
+ stp x25, x26, [x1], #16
+ stp x27, x28, [x1], #16
+
+ // |func| is required to preserve x29, the frame pointer. We cannot load
+ // random values into x29 (see comment above), so compare it against the
+ // expected value and zero the field of |state| if corrupted.
+ mov x9, sp
+ cmp x29, x9
+ b.eq Lx29_ok
+ str xzr, [x1]
+
+Lx29_ok:
+ // Restore callee-saved registers.
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldp x23, x24, [sp, #112]
+ ldp x25, x26, [sp, #128]
+ ldp x27, x28, [sp, #144]
+
+ ldp x29, x30, [sp], #176
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+
+.globl _abi_test_clobber_x0
+.private_extern _abi_test_clobber_x0
+.align 4
+_abi_test_clobber_x0:
+ AARCH64_VALID_CALL_TARGET
+ mov x0, xzr
+ ret
+
+
+.globl _abi_test_clobber_x1
+.private_extern _abi_test_clobber_x1
+.align 4
+_abi_test_clobber_x1:
+ AARCH64_VALID_CALL_TARGET
+ mov x1, xzr
+ ret
+
+
+.globl _abi_test_clobber_x2
+.private_extern _abi_test_clobber_x2
+.align 4
+_abi_test_clobber_x2:
+ AARCH64_VALID_CALL_TARGET
+ mov x2, xzr
+ ret
+
+
+.globl _abi_test_clobber_x3
+.private_extern _abi_test_clobber_x3
+.align 4
+_abi_test_clobber_x3:
+ AARCH64_VALID_CALL_TARGET
+ mov x3, xzr
+ ret
+
+
+.globl _abi_test_clobber_x4
+.private_extern _abi_test_clobber_x4
+.align 4
+_abi_test_clobber_x4:
+ AARCH64_VALID_CALL_TARGET
+ mov x4, xzr
+ ret
+
+
+.globl _abi_test_clobber_x5
+.private_extern _abi_test_clobber_x5
+.align 4
+_abi_test_clobber_x5:
+ AARCH64_VALID_CALL_TARGET
+ mov x5, xzr
+ ret
+
+
+.globl _abi_test_clobber_x6
+.private_extern _abi_test_clobber_x6
+.align 4
+_abi_test_clobber_x6:
+ AARCH64_VALID_CALL_TARGET
+ mov x6, xzr
+ ret
+
+
+.globl _abi_test_clobber_x7
+.private_extern _abi_test_clobber_x7
+.align 4
+_abi_test_clobber_x7:
+ AARCH64_VALID_CALL_TARGET
+ mov x7, xzr
+ ret
+
+
+.globl _abi_test_clobber_x8
+.private_extern _abi_test_clobber_x8
+.align 4
+_abi_test_clobber_x8:
+ AARCH64_VALID_CALL_TARGET
+ mov x8, xzr
+ ret
+
+
+.globl _abi_test_clobber_x9
+.private_extern _abi_test_clobber_x9
+.align 4
+_abi_test_clobber_x9:
+ AARCH64_VALID_CALL_TARGET
+ mov x9, xzr
+ ret
+
+
+.globl _abi_test_clobber_x10
+.private_extern _abi_test_clobber_x10
+.align 4
+_abi_test_clobber_x10:
+ AARCH64_VALID_CALL_TARGET
+ mov x10, xzr
+ ret
+
+
+.globl _abi_test_clobber_x11
+.private_extern _abi_test_clobber_x11
+.align 4
+_abi_test_clobber_x11:
+ AARCH64_VALID_CALL_TARGET
+ mov x11, xzr
+ ret
+
+
+.globl _abi_test_clobber_x12
+.private_extern _abi_test_clobber_x12
+.align 4
+_abi_test_clobber_x12:
+ AARCH64_VALID_CALL_TARGET
+ mov x12, xzr
+ ret
+
+
+.globl _abi_test_clobber_x13
+.private_extern _abi_test_clobber_x13
+.align 4
+_abi_test_clobber_x13:
+ AARCH64_VALID_CALL_TARGET
+ mov x13, xzr
+ ret
+
+
+.globl _abi_test_clobber_x14
+.private_extern _abi_test_clobber_x14
+.align 4
+_abi_test_clobber_x14:
+ AARCH64_VALID_CALL_TARGET
+ mov x14, xzr
+ ret
+
+
+.globl _abi_test_clobber_x15
+.private_extern _abi_test_clobber_x15
+.align 4
+_abi_test_clobber_x15:
+ AARCH64_VALID_CALL_TARGET
+ mov x15, xzr
+ ret
+
+
+.globl _abi_test_clobber_x16
+.private_extern _abi_test_clobber_x16
+.align 4
+_abi_test_clobber_x16:
+ AARCH64_VALID_CALL_TARGET
+ mov x16, xzr
+ ret
+
+
+.globl _abi_test_clobber_x17
+.private_extern _abi_test_clobber_x17
+.align 4
+_abi_test_clobber_x17:
+ AARCH64_VALID_CALL_TARGET
+ mov x17, xzr
+ ret
+
+
+.globl _abi_test_clobber_x19
+.private_extern _abi_test_clobber_x19
+.align 4
+_abi_test_clobber_x19:
+ AARCH64_VALID_CALL_TARGET
+ mov x19, xzr
+ ret
+
+
+.globl _abi_test_clobber_x20
+.private_extern _abi_test_clobber_x20
+.align 4
+_abi_test_clobber_x20:
+ AARCH64_VALID_CALL_TARGET
+ mov x20, xzr
+ ret
+
+
+.globl _abi_test_clobber_x21
+.private_extern _abi_test_clobber_x21
+.align 4
+_abi_test_clobber_x21:
+ AARCH64_VALID_CALL_TARGET
+ mov x21, xzr
+ ret
+
+
+.globl _abi_test_clobber_x22
+.private_extern _abi_test_clobber_x22
+.align 4
+_abi_test_clobber_x22:
+ AARCH64_VALID_CALL_TARGET
+ mov x22, xzr
+ ret
+
+
+.globl _abi_test_clobber_x23
+.private_extern _abi_test_clobber_x23
+.align 4
+_abi_test_clobber_x23:
+ AARCH64_VALID_CALL_TARGET
+ mov x23, xzr
+ ret
+
+
+.globl _abi_test_clobber_x24
+.private_extern _abi_test_clobber_x24
+.align 4
+_abi_test_clobber_x24:
+ AARCH64_VALID_CALL_TARGET
+ mov x24, xzr
+ ret
+
+
+.globl _abi_test_clobber_x25
+.private_extern _abi_test_clobber_x25
+.align 4
+_abi_test_clobber_x25:
+ AARCH64_VALID_CALL_TARGET
+ mov x25, xzr
+ ret
+
+
+.globl _abi_test_clobber_x26
+.private_extern _abi_test_clobber_x26
+.align 4
+_abi_test_clobber_x26:
+ AARCH64_VALID_CALL_TARGET
+ mov x26, xzr
+ ret
+
+
+.globl _abi_test_clobber_x27
+.private_extern _abi_test_clobber_x27
+.align 4
+_abi_test_clobber_x27:
+ AARCH64_VALID_CALL_TARGET
+ mov x27, xzr
+ ret
+
+
+.globl _abi_test_clobber_x28
+.private_extern _abi_test_clobber_x28
+.align 4
+_abi_test_clobber_x28:
+ AARCH64_VALID_CALL_TARGET
+ mov x28, xzr
+ ret
+
+
+.globl _abi_test_clobber_x29
+.private_extern _abi_test_clobber_x29
+.align 4
+_abi_test_clobber_x29:
+ AARCH64_VALID_CALL_TARGET
+ mov x29, xzr
+ ret
+
+
+.globl _abi_test_clobber_d0
+.private_extern _abi_test_clobber_d0
+.align 4
+_abi_test_clobber_d0:
+ AARCH64_VALID_CALL_TARGET
+ fmov d0, xzr
+ ret
+
+
+.globl _abi_test_clobber_d1
+.private_extern _abi_test_clobber_d1
+.align 4
+_abi_test_clobber_d1:
+ AARCH64_VALID_CALL_TARGET
+ fmov d1, xzr
+ ret
+
+
+.globl _abi_test_clobber_d2
+.private_extern _abi_test_clobber_d2
+.align 4
+_abi_test_clobber_d2:
+ AARCH64_VALID_CALL_TARGET
+ fmov d2, xzr
+ ret
+
+
+.globl _abi_test_clobber_d3
+.private_extern _abi_test_clobber_d3
+.align 4
+_abi_test_clobber_d3:
+ AARCH64_VALID_CALL_TARGET
+ fmov d3, xzr
+ ret
+
+
+.globl _abi_test_clobber_d4
+.private_extern _abi_test_clobber_d4
+.align 4
+_abi_test_clobber_d4:
+ AARCH64_VALID_CALL_TARGET
+ fmov d4, xzr
+ ret
+
+
+.globl _abi_test_clobber_d5
+.private_extern _abi_test_clobber_d5
+.align 4
+_abi_test_clobber_d5:
+ AARCH64_VALID_CALL_TARGET
+ fmov d5, xzr
+ ret
+
+
+.globl _abi_test_clobber_d6
+.private_extern _abi_test_clobber_d6
+.align 4
+_abi_test_clobber_d6:
+ AARCH64_VALID_CALL_TARGET
+ fmov d6, xzr
+ ret
+
+
+.globl _abi_test_clobber_d7
+.private_extern _abi_test_clobber_d7
+.align 4
+_abi_test_clobber_d7:
+ AARCH64_VALID_CALL_TARGET
+ fmov d7, xzr
+ ret
+
+
+.globl _abi_test_clobber_d8
+.private_extern _abi_test_clobber_d8
+.align 4
+_abi_test_clobber_d8:
+ AARCH64_VALID_CALL_TARGET
+ fmov d8, xzr
+ ret
+
+
+.globl _abi_test_clobber_d9
+.private_extern _abi_test_clobber_d9
+.align 4
+_abi_test_clobber_d9:
+ AARCH64_VALID_CALL_TARGET
+ fmov d9, xzr
+ ret
+
+
+.globl _abi_test_clobber_d10
+.private_extern _abi_test_clobber_d10
+.align 4
+_abi_test_clobber_d10:
+ AARCH64_VALID_CALL_TARGET
+ fmov d10, xzr
+ ret
+
+
+.globl _abi_test_clobber_d11
+.private_extern _abi_test_clobber_d11
+.align 4
+_abi_test_clobber_d11:
+ AARCH64_VALID_CALL_TARGET
+ fmov d11, xzr
+ ret
+
+
+.globl _abi_test_clobber_d12
+.private_extern _abi_test_clobber_d12
+.align 4
+_abi_test_clobber_d12:
+ AARCH64_VALID_CALL_TARGET
+ fmov d12, xzr
+ ret
+
+
+.globl _abi_test_clobber_d13
+.private_extern _abi_test_clobber_d13
+.align 4
+_abi_test_clobber_d13:
+ AARCH64_VALID_CALL_TARGET
+ fmov d13, xzr
+ ret
+
+
+.globl _abi_test_clobber_d14
+.private_extern _abi_test_clobber_d14
+.align 4
+_abi_test_clobber_d14:
+ AARCH64_VALID_CALL_TARGET
+ fmov d14, xzr
+ ret
+
+
+.globl _abi_test_clobber_d15
+.private_extern _abi_test_clobber_d15
+.align 4
+_abi_test_clobber_d15:
+ AARCH64_VALID_CALL_TARGET
+ fmov d15, xzr
+ ret
+
+
+.globl _abi_test_clobber_d16
+.private_extern _abi_test_clobber_d16
+.align 4
+_abi_test_clobber_d16:
+ AARCH64_VALID_CALL_TARGET
+ fmov d16, xzr
+ ret
+
+
+.globl _abi_test_clobber_d17
+.private_extern _abi_test_clobber_d17
+.align 4
+_abi_test_clobber_d17:
+ AARCH64_VALID_CALL_TARGET
+ fmov d17, xzr
+ ret
+
+
+.globl _abi_test_clobber_d18
+.private_extern _abi_test_clobber_d18
+.align 4
+_abi_test_clobber_d18:
+ AARCH64_VALID_CALL_TARGET
+ fmov d18, xzr
+ ret
+
+
+.globl _abi_test_clobber_d19
+.private_extern _abi_test_clobber_d19
+.align 4
+_abi_test_clobber_d19:
+ AARCH64_VALID_CALL_TARGET
+ fmov d19, xzr
+ ret
+
+
+.globl _abi_test_clobber_d20
+.private_extern _abi_test_clobber_d20
+.align 4
+_abi_test_clobber_d20:
+ AARCH64_VALID_CALL_TARGET
+ fmov d20, xzr
+ ret
+
+
+.globl _abi_test_clobber_d21
+.private_extern _abi_test_clobber_d21
+.align 4
+_abi_test_clobber_d21:
+ AARCH64_VALID_CALL_TARGET
+ fmov d21, xzr
+ ret
+
+
+.globl _abi_test_clobber_d22
+.private_extern _abi_test_clobber_d22
+.align 4
+_abi_test_clobber_d22:
+ AARCH64_VALID_CALL_TARGET
+ fmov d22, xzr
+ ret
+
+
+.globl _abi_test_clobber_d23
+.private_extern _abi_test_clobber_d23
+.align 4
+_abi_test_clobber_d23:
+ AARCH64_VALID_CALL_TARGET
+ fmov d23, xzr
+ ret
+
+
+.globl _abi_test_clobber_d24
+.private_extern _abi_test_clobber_d24
+.align 4
+_abi_test_clobber_d24:
+ AARCH64_VALID_CALL_TARGET
+ fmov d24, xzr
+ ret
+
+
+.globl _abi_test_clobber_d25
+.private_extern _abi_test_clobber_d25
+.align 4
+_abi_test_clobber_d25:
+ AARCH64_VALID_CALL_TARGET
+ fmov d25, xzr
+ ret
+
+
+.globl _abi_test_clobber_d26
+.private_extern _abi_test_clobber_d26
+.align 4
+_abi_test_clobber_d26:
+ AARCH64_VALID_CALL_TARGET
+ fmov d26, xzr
+ ret
+
+
+.globl _abi_test_clobber_d27
+.private_extern _abi_test_clobber_d27
+.align 4
+_abi_test_clobber_d27:
+ AARCH64_VALID_CALL_TARGET
+ fmov d27, xzr
+ ret
+
+
+.globl _abi_test_clobber_d28
+.private_extern _abi_test_clobber_d28
+.align 4
+_abi_test_clobber_d28:
+ AARCH64_VALID_CALL_TARGET
+ fmov d28, xzr
+ ret
+
+
+.globl _abi_test_clobber_d29
+.private_extern _abi_test_clobber_d29
+.align 4
+_abi_test_clobber_d29:
+ AARCH64_VALID_CALL_TARGET
+ fmov d29, xzr
+ ret
+
+
+.globl _abi_test_clobber_d30
+.private_extern _abi_test_clobber_d30
+.align 4
+_abi_test_clobber_d30:
+ AARCH64_VALID_CALL_TARGET
+ fmov d30, xzr
+ ret
+
+
+.globl _abi_test_clobber_d31
+.private_extern _abi_test_clobber_d31
+.align 4
+_abi_test_clobber_d31:
+ AARCH64_VALID_CALL_TARGET
+ fmov d31, xzr
+ ret
+
+
+.globl _abi_test_clobber_v8_upper
+.private_extern _abi_test_clobber_v8_upper
+.align 4
+_abi_test_clobber_v8_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v8.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v9_upper
+.private_extern _abi_test_clobber_v9_upper
+.align 4
+_abi_test_clobber_v9_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v9.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v10_upper
+.private_extern _abi_test_clobber_v10_upper
+.align 4
+_abi_test_clobber_v10_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v10.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v11_upper
+.private_extern _abi_test_clobber_v11_upper
+.align 4
+_abi_test_clobber_v11_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v11.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v12_upper
+.private_extern _abi_test_clobber_v12_upper
+.align 4
+_abi_test_clobber_v12_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v12.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v13_upper
+.private_extern _abi_test_clobber_v13_upper
+.align 4
+_abi_test_clobber_v13_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v13.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v14_upper
+.private_extern _abi_test_clobber_v14_upper
+.align 4
+_abi_test_clobber_v14_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v14.d[1], xzr
+ ret
+
+
+.globl _abi_test_clobber_v15_upper
+.private_extern _abi_test_clobber_v15_upper
+.align 4
+_abi_test_clobber_v15_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v15.d[1], xzr
+ ret
+
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/chacha/chacha-armv4.S b/apple-arm/crypto/chacha/chacha-armv4.S
new file mode 100644
index 0000000..cadf2b6
--- /dev/null
+++ b/apple-arm/crypto/chacha/chacha-armv4.S
@@ -0,0 +1,1498 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+Lone:
+.long 1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+LOPENSSL_armcap:
+.word OPENSSL_armcap_P-LChaCha20_ctr32
+#else
+.word -1
+#endif
+
+.globl _ChaCha20_ctr32
+.private_extern _ChaCha20_ctr32
+#ifdef __thumb2__
+.thumb_func _ChaCha20_ctr32
+#endif
+.align 5
+_ChaCha20_ctr32:
+LChaCha20_ctr32:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r14,pc,#16 @ _ChaCha20_ctr32
+#else
+ adr r14,LChaCha20_ctr32
+#endif
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq Lno_data
+#if __ARM_MAX_ARCH__>=7
+ cmp r2,#192 @ test len
+ bls Lshort
+ ldr r4,[r14,#-32]
+ ldr r4,[r14,r4]
+# ifdef __APPLE__
+ ldr r4,[r4]
+# endif
+ tst r4,#ARMV7_NEON
+ bne LChaCha20_neon
+Lshort:
+#endif
+ ldmia r12,{r4,r5,r6,r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ sub r14,r14,#64 @ Lsigma
+ stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key
+ stmdb sp!,{r0,r1,r2,r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b Loop_outer_enter
+
+.align 4
+Loop_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b Loop
+
+.align 4
+Loop:
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne Lunaligned
+ cmp r11,#64 @ restore flags
+# else
+ ldr r10,[sp,#4*(2)]
+# endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi Loop_outer
+
+ beq Ldone
+# if __ARM_ARCH__<7
+ b Ltail
+
+.align 4
+Lunaligned:@ unaligned endian-neutral path
+ cmp r11,#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ bhi Loop_outer
+
+ beq Ldone
+#endif
+
+Ltail:
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne Loop_tail
+
+Ldone:
+ add sp,sp,#4*(32+3)
+Lno_data:
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func ChaCha20_neon
+#endif
+.align 5
+ChaCha20_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+LChaCha20_neon:
+ adr r14,Lsigma
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so
+ stmdb sp!,{r0,r1,r2,r3}
+
+ vld1.32 {q1,q2},[r3] @ load key
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0,q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b Loop_neon_enter
+
+.align 4
+Loop_neon_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b Loop_neon
+
+.align 4
+Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12,q13},[sp] @ load key material
+ vld1.32 {q14,q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo Ltail_neon
+
+ vld1.8 {q12,q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0,q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0,q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2,q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4,q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6,q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8,q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10,q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi Loop_neon_outer
+
+ b Ldone_neon
+
+.align 4
+Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0,q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2,q3},[r11]
+ mov r11,#10
+ b Loop @ go integer-only
+
+.align 4
+Ltail_neon:
+ cmp r11,#64*3
+ bhs L192_or_more_neon
+ cmp r11,#64*2
+ bhs L128_or_more_neon
+ cmp r11,#64*1
+ bhs L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0,q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2,q3},[r8]
+ b Loop_tail_neon
+
+.align 4
+L64_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0,q1},[r14]!
+ vst1.8 {q2,q3},[r14]!
+
+ beq Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4,q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6,q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b Loop_tail_neon
+
+.align 4
+L128_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0,q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4,q5},[r14]!
+ vst1.8 {q6,q7},[r14]!
+
+ beq Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8,q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10,q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b Loop_tail_neon
+
+.align 4
+L192_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0,q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2,q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4,q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6,q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8,q9},[r14]!
+ vst1.8 {q10,q11},[r14]!
+
+ beq Ldone_neon
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne Loop_tail_neon
+
+Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+.comm _OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol _OPENSSL_armcap_P
+.long 0
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/aesv8-armx32.S b/apple-arm/crypto/fipsmodule/aesv8-armx32.S
new file mode 100644
index 0000000..87b4b0a
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/aesv8-armx32.S
@@ -0,0 +1,809 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+
+.code 32
+#undef __thumb2__
+.align 5
+Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl _aes_hw_set_encrypt_key
+.private_extern _aes_hw_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func _aes_hw_set_encrypt_key
+#endif
+.align 5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+ mov r3,#-1
+ cmp r0,#0
+ beq Lenc_key_abort
+ cmp r2,#0
+ beq Lenc_key_abort
+ mov r3,#-2
+ cmp r1,#128
+ blt Lenc_key_abort
+ cmp r1,#256
+ bgt Lenc_key_abort
+ tst r1,#0x3f
+ bne Lenc_key_abort
+
+ adr r3,Lrcon
+ cmp r1,#192
+
+ veor q0,q0,q0
+ vld1.8 {q3},[r0]!
+ mov r1,#8 @ reuse r1
+ vld1.32 {q1,q2},[r3]!
+
+ blt Loop128
+ beq L192
+ b L256
+
+.align 4
+Loop128:
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ bne Loop128
+
+ vld1.32 {q1},[r3]
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]
+ add r2,r2,#0x50
+
+ mov r12,#10
+ b Ldone
+
+.align 4
+L192:
+ vld1.8 {d16},[r0]!
+ vmov.i8 q10,#8 @ borrow q10
+ vst1.32 {q3},[r2]!
+ vsub.i8 q2,q2,q10 @ adjust the mask
+
+Loop192:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {d16},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+
+ vdup.32 q9,d7[1]
+ veor q9,q9,q8
+ veor q10,q10,q1
+ vext.8 q8,q0,q8,#12
+ vshl.u8 q1,q1,#1
+ veor q8,q8,q9
+ veor q3,q3,q10
+ veor q8,q8,q10
+ vst1.32 {q3},[r2]!
+ bne Loop192
+
+ mov r12,#12
+ add r2,r2,#0x20
+ b Ldone
+
+.align 4
+L256:
+ vld1.8 {q8},[r0]
+ mov r1,#7
+ mov r12,#14
+ vst1.32 {q3},[r2]!
+
+Loop256:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q8},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]!
+ beq Ldone
+
+ vdup.32 q10,d7[1]
+ vext.8 q9,q0,q8,#12
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+
+ veor q8,q8,q10
+ b Loop256
+
+Ldone:
+ str r12,[r2]
+ mov r3,#0
+
+Lenc_key_abort:
+ mov r0,r3 @ return value
+
+ bx lr
+
+
+.globl _aes_hw_set_decrypt_key
+.private_extern _aes_hw_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func _aes_hw_set_decrypt_key
+#endif
+.align 5
+_aes_hw_set_decrypt_key:
+ stmdb sp!,{r4,lr}
+ bl Lenc_key
+
+ cmp r0,#0
+ bne Ldec_key_abort
+
+ sub r2,r2,#240 @ restore original r2
+ mov r4,#-16
+ add r0,r2,r12,lsl#4 @ end of key schedule
+
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+
+Loop_imc:
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+ cmp r0,r2
+ bhi Loop_imc
+
+ vld1.32 {q0},[r2]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vst1.32 {q0},[r0]
+
+ eor r0,r0,r0 @ return value
+Ldec_key_abort:
+ ldmia sp!,{r4,pc}
+
+.globl _aes_hw_encrypt
+.private_extern _aes_hw_encrypt
+#ifdef __thumb2__
+.thumb_func _aes_hw_encrypt
+#endif
+.align 5
+_aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+Loop_enc:
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt Loop_enc
+
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+
+.globl _aes_hw_decrypt
+.private_extern _aes_hw_decrypt
+#ifdef __thumb2__
+.thumb_func _aes_hw_decrypt
+#endif
+.align 5
+_aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+Loop_dec:
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt Loop_dec
+
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+
+.globl _aes_hw_cbc_encrypt
+.private_extern _aes_hw_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func _aes_hw_cbc_encrypt
+#endif
+.align 5
+_aes_hw_cbc_encrypt:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load remaining args
+ subs r2,r2,#16
+ mov r8,#16
+ blo Lcbc_abort
+ moveq r8,#0
+
+ cmp r5,#0 @ en- or decrypting?
+ ldr r5,[r3,#240]
+ and r2,r2,#-16
+ vld1.8 {q6},[r4]
+ vld1.8 {q0},[r0],r8
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#6
+ add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
+ sub r5,r5,#2
+ vld1.32 {q10,q11},[r7]!
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+
+ add r7,r3,#32
+ mov r6,r5
+ beq Lcbc_dec
+
+ cmp r5,#2
+ veor q0,q0,q6
+ veor q5,q8,q7
+ beq Lcbc_enc128
+
+ vld1.32 {q2,q3},[r7]
+ add r7,r3,#16
+ add r6,r3,#16*4
+ add r12,r3,#16*5
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r14,r3,#16*6
+ add r3,r3,#16*7
+ b Lenter_cbc_enc
+
+.align 4
+Loop_cbc_enc:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+Lenter_cbc_enc:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r6]
+ cmp r5,#4
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r12]
+ beq Lcbc_enc192
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r14]
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r3]
+ nop
+
+Lcbc_enc192:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs Loop_cbc_enc
+
+ vst1.8 {q6},[r1]!
+ b Lcbc_done
+
+.align 5
+Lcbc_enc128:
+ vld1.32 {q2,q3},[r7]
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ b Lenter_cbc_enc128
+Loop_cbc_enc128:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+Lenter_cbc_enc128:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs Loop_cbc_enc128
+
+ vst1.8 {q6},[r1]!
+ b Lcbc_done
+.align 5
+Lcbc_dec:
+ vld1.8 {q10},[r0]!
+ subs r2,r2,#32 @ bias
+ add r6,r5,#2
+ vorr q3,q0,q0
+ vorr q1,q0,q0
+ vorr q11,q10,q10
+ blo Lcbc_dec_tail
+
+ vorr q1,q10,q10
+ vld1.8 {q10},[r0]!
+ vorr q2,q0,q0
+ vorr q3,q1,q1
+ vorr q11,q10,q10
+
+Loop3x_cbc_dec:
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt Loop3x_cbc_dec
+
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q4,q6,q7
+ subs r2,r2,#0x30
+ veor q5,q2,q7
+ movlo r6,r2 @ r6, r6, is zero at this point
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+ add r0,r0,r6 @ r0 is adjusted in such way that
+ @ at exit from the loop q1-q10
+ @ are loaded with last "words"
+ vorr q6,q11,q11
+ mov r7,r3
+.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q2},[r0]!
+.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q3},[r0]!
+.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q11},[r0]!
+.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ add r6,r5,#2
+ veor q4,q4,q0
+ veor q5,q5,q1
+ veor q10,q10,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q4},[r1]!
+ vorr q0,q2,q2
+ vst1.8 {q5},[r1]!
+ vorr q1,q3,q3
+ vst1.8 {q10},[r1]!
+ vorr q10,q11,q11
+ bhs Loop3x_cbc_dec
+
+ cmn r2,#0x30
+ beq Lcbc_done
+ nop
+
+Lcbc_dec_tail:
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt Lcbc_dec_tail
+
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ cmn r2,#0x20
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q5,q6,q7
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ beq Lcbc_dec_one
+ veor q5,q5,q1
+ veor q9,q9,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+ vst1.8 {q9},[r1]!
+ b Lcbc_done
+
+Lcbc_dec_one:
+ veor q5,q5,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+
+Lcbc_done:
+ vst1.8 {q6},[r4]
+Lcbc_abort:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,pc}
+
+.globl _aes_hw_ctr32_encrypt_blocks
+.private_extern _aes_hw_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func _aes_hw_ctr32_encrypt_blocks
+#endif
+.align 5
+_aes_hw_ctr32_encrypt_blocks:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+ ldr r5,[r3,#240]
+
+ ldr r8, [r4, #12]
+ vld1.32 {q0},[r4]
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#4
+ mov r12,#16
+ cmp r2,#2
+ add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
+ sub r5,r5,#2
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+ add r7,r3,#32
+ mov r6,r5
+ movlo r12,#0
+
+ @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ @ affected by silicon errata #1742098 [0] and #1655431 [1],
+ @ respectively, where the second instruction of an aese/aesmc
+ @ instruction pair may execute twice if an interrupt is taken right
+ @ after the first instruction consumes an input register of which a
+ @ single 32-bit lane has been updated the last time it was modified.
+ @
+ @ This function uses a counter in one 32-bit lane. The
+ @ could write to q1 and q10 directly, but that trips this bugs.
+ @ We write to q6 and copy to the final register as a workaround.
+ @
+ @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __ARMEB__
+ rev r8, r8
+#endif
+ add r10, r8, #1
+ vorr q6,q0,q0
+ rev r10, r10
+ vmov.32 d13[1],r10
+ add r8, r8, #2
+ vorr q1,q6,q6
+ bls Lctr32_tail
+ rev r12, r8
+ vmov.32 d13[1],r12
+ sub r2,r2,#3 @ bias
+ vorr q10,q6,q6
+ b Loop3x_ctr32
+
+.align 4
+Loop3x_ctr32:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt Loop3x_ctr32
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
+ vld1.8 {q2},[r0]!
+ add r9,r8,#1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.8 {q3},[r0]!
+ rev r9,r9
+.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.8 {q11},[r0]!
+ mov r7,r3
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
+.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ veor q2,q2,q7
+ add r10,r8,#2
+.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ veor q3,q3,q7
+ add r8,r8,#3
+.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ @ Note the logic to update q0, q1, and q1 is written to work
+ @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ @ 32-bit mode. See the comment above.
+ veor q11,q11,q7
+ vmov.32 d13[1], r9
+.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q0,q6,q6
+ rev r10,r10
+.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ vmov.32 d13[1], r10
+ rev r12,r8
+.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vorr q1,q6,q6
+ vmov.32 d13[1], r12
+.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q10,q6,q6
+ subs r2,r2,#3
+.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
+.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
+.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
+
+ veor q2,q2,q4
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vst1.8 {q2},[r1]!
+ veor q3,q3,q5
+ mov r6,r5
+ vst1.8 {q3},[r1]!
+ veor q11,q11,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q11},[r1]!
+ bhs Loop3x_ctr32
+
+ adds r2,r2,#3
+ beq Lctr32_done
+ cmp r2,#1
+ mov r12,#16
+ moveq r12,#0
+
+Lctr32_tail:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q9},[r7]!
+ bgt Lctr32_tail
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q2},[r0],r12
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q2,q2,q7
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q3,q3,q7
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
+
+ cmp r2,#1
+ veor q2,q2,q0
+ veor q3,q3,q1
+ vst1.8 {q2},[r1]!
+ beq Lctr32_done
+ vst1.8 {q3},[r1]
+
+Lctr32_done:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/armv4-mont.S b/apple-arm/crypto/fipsmodule/armv4-mont.S
new file mode 100644
index 0000000..e549d1f
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/armv4-mont.S
@@ -0,0 +1,982 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align 5
+LOPENSSL_armcap:
+.word OPENSSL_armcap_P-Lbn_mul_mont
+#endif
+
+.globl _bn_mul_mont
+.private_extern _bn_mul_mont
+#ifdef __thumb2__
+.thumb_func _bn_mul_mont
+#endif
+
+.align 5
+_bn_mul_mont:
+Lbn_mul_mont:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+ tst ip,#7
+ bne Lialu
+ adr r0,Lbn_mul_mont
+ ldr r2,LOPENSSL_armcap
+ ldr r0,[r0,r2]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+ tst r0,#ARMV7_NEON @ NEON available?
+ ldmia sp, {r0,r2}
+ beq Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+Lialu:
+#endif
+ cmp ip,#2
+ mov r0,ip @ load num
+#ifdef __thumb2__
+ ittt lt
+#endif
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt Labrt
+
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
+
+ mov r0,r0,lsl#2 @ rescale r0 for byte count
+ sub sp,sp,r0 @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub r0,r0,#4 @ "num=num-1"
+ add r4,r2,r0 @ &bp[num-1]
+
+ add r0,sp,r0 @ r0 to point at &tp[num-1]
+ ldr r8,[r0,#14*4] @ &n0
+ ldr r2,[r2] @ bp[0]
+ ldr r5,[r1],#4 @ ap[0],ap++
+ ldr r6,[r3],#4 @ np[0],np++
+ ldr r8,[r8] @ *n0
+ str r4,[r0,#15*4] @ save &bp[num]
+
+ umull r10,r11,r5,r2 @ ap[0]*bp[0]
+ str r8,[r0,#14*4] @ save n0 value
+ mul r8,r10,r8 @ "tp[0]"*n0
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
+ mov r4,sp
+
+L1st:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[0]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne L1st
+
+ adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
+ mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ mov r7,sp
+ str r14,[r0,#4] @ tp[num]=
+
+Louter:
+ sub r7,r0,r7 @ "original" r0-1 value
+ sub r1,r1,r7 @ "rewind" ap to &ap[1]
+ ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
+ ldr r5,[r1,#-4] @ ap[0]
+ ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
+ ldr r7,[sp,#4] @ tp[1]
+
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
+ str r4,[r0,#13*4] @ save bp
+ mul r8,r10,r8
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
+ mov r4,sp
+
+Linner:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[i]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne Linner
+
+ adds r12,r12,r11
+ mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
+ adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adds r12,r12,r7
+ ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ str r14,[r0,#4] @ tp[num]=
+
+ cmp r4,r7
+#ifdef __thumb2__
+ itt ne
+#endif
+ movne r7,sp
+ bne Louter
+
+ ldr r2,[r0,#12*4] @ pull rp
+ mov r5,sp
+ add r0,r0,#4 @ r0 to point at &tp[num]
+ sub r5,r0,r5 @ "original" num value
+ mov r4,sp @ "rewind" r4
+ mov r1,r4 @ "borrow" r1
+ sub r3,r3,r5 @ "rewind" r3 to &np[0]
+
+ subs r7,r7,r7 @ "clear" carry flag
+Lsub: ldr r7,[r4],#4
+ ldr r6,[r3],#4
+ sbcs r7,r7,r6 @ tp[j]-np[j]
+ str r7,[r2],#4 @ rp[j]=
+ teq r4,r0 @ preserve carry
+ bne Lsub
+ sbcs r14,r14,#0 @ upmost carry
+ mov r4,sp @ "rewind" r4
+ sub r2,r2,r5 @ "rewind" r2
+
+Lcopy: ldr r7,[r4] @ conditional copy
+ ldr r5,[r2]
+ str sp,[r4],#4 @ zap tp
+#ifdef __thumb2__
+ it cc
+#endif
+ movcc r5,r7
+ str r5,[r2],#4
+ teq r4,r0 @ preserve carry
+ bne Lcopy
+
+ mov sp,r0
+ add sp,sp,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+Labrt:
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func bn_mul8x_mont_neon
+#endif
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load rest of parameter block
+ mov ip,sp
+
+ cmp r5,#8
+ bhi LNEON_8n
+
+ @ special case for r5==8, everything is in register bank...
+
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ sub r7,sp,r5,lsl#4
+ vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
+ and r7,r7,#-64
+ vld1.32 {d30[0]}, [r4,:32]
+ mov sp,r7 @ alloca
+ vzip.16 d28,d8
+
+ vmull.u32 q6,d28,d0[0]
+ vmull.u32 q7,d28,d0[1]
+ vmull.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmull.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ vmul.u32 d29,d29,d30
+
+ vmull.u32 q10,d28,d2[0]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+ vmull.u32 q11,d28,d2[1]
+ vmull.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmull.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ sub r9,r5,#1
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ b LNEON_outer8
+
+.align 4
+LNEON_outer8:
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ vadd.u64 d12,d12,d10
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ subs r9,r9,#1
+ vmul.u32 d29,d29,d30
+
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ bne LNEON_outer8
+
+ vadd.u64 d12,d12,d10
+ mov r7,sp
+ vshr.u64 d10,d12,#16
+ mov r8,r5
+ vadd.u64 d13,d13,d10
+ add r6,sp,#96
+ vshr.u64 d10,d13,#16
+ vzip.16 d12,d13
+
+ b LNEON_tail_entry
+
+.align 4
+LNEON_8n:
+ veor q6,q6,q6
+ sub r7,sp,#128
+ veor q7,q7,q7
+ sub r7,r7,r5,lsl#4
+ veor q8,q8,q8
+ and r7,r7,#-64
+ veor q9,q9,q9
+ mov sp,r7 @ alloca
+ veor q10,q10,q10
+ add r7,r7,#256
+ veor q11,q11,q11
+ sub r8,r5,#8
+ veor q12,q12,q12
+ veor q13,q13,q13
+
+LNEON_8n_init:
+ vst1.64 {q6,q7},[r7,:256]!
+ subs r8,r8,#8
+ vst1.64 {q8,q9},[r7,:256]!
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12,q13},[r7,:256]!
+ bne LNEON_8n_init
+
+ add r6,sp,#256
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ add r10,sp,#8
+ vld1.32 {d30[0]},[r4,:32]
+ mov r9,r5
+ b LNEON_8n_outer
+
+.align 4
+LNEON_8n_outer:
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ add r7,sp,#128
+ vld1.32 {d4,d5,d6,d7},[r3]!
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+ vadd.u64 d29,d29,d12
+ vmlal.u32 q10,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q11,d28,d2[1]
+ vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q6,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q7,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q8,d29,d5[0]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vadd.u64 d12,d12,d13
+ vmlal.u32 q11,d29,d6[1]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vadd.u64 d14,d14,d12
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0]
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]!
+ vmlal.u32 q8,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q9,d28,d1[0]
+ vshl.i64 d29,d15,#16
+ vmlal.u32 q10,d28,d1[1]
+ vadd.u64 d29,d29,d14
+ vmlal.u32 q11,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q12,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1]
+ vmlal.u32 q13,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q7,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q8,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q9,d29,d5[0]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vadd.u64 d14,d14,d15
+ vmlal.u32 q12,d29,d6[1]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vadd.u64 d16,d16,d14
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1]
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]!
+ vmlal.u32 q9,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q10,d28,d1[0]
+ vshl.i64 d29,d17,#16
+ vmlal.u32 q11,d28,d1[1]
+ vadd.u64 d29,d29,d16
+ vmlal.u32 q12,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q13,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2]
+ vmlal.u32 q6,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q8,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q9,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q10,d29,d5[0]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vadd.u64 d16,d16,d17
+ vmlal.u32 q13,d29,d6[1]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vadd.u64 d18,d18,d16
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2]
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]!
+ vmlal.u32 q10,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q11,d28,d1[0]
+ vshl.i64 d29,d19,#16
+ vmlal.u32 q12,d28,d1[1]
+ vadd.u64 d29,d29,d18
+ vmlal.u32 q13,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q6,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3]
+ vmlal.u32 q7,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q9,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q10,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q11,d29,d5[0]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vadd.u64 d18,d18,d19
+ vmlal.u32 q6,d29,d6[1]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vadd.u64 d20,d20,d18
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3]
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]!
+ vmlal.u32 q11,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q12,d28,d1[0]
+ vshl.i64 d29,d21,#16
+ vmlal.u32 q13,d28,d1[1]
+ vadd.u64 d29,d29,d20
+ vmlal.u32 q6,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q7,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4]
+ vmlal.u32 q8,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q10,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q11,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q12,d29,d5[0]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vadd.u64 d20,d20,d21
+ vmlal.u32 q7,d29,d6[1]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vadd.u64 d22,d22,d20
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4]
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]!
+ vmlal.u32 q12,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q13,d28,d1[0]
+ vshl.i64 d29,d23,#16
+ vmlal.u32 q6,d28,d1[1]
+ vadd.u64 d29,d29,d22
+ vmlal.u32 q7,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q8,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5]
+ vmlal.u32 q9,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q11,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q12,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q13,d29,d5[0]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vadd.u64 d22,d22,d23
+ vmlal.u32 q8,d29,d6[1]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vadd.u64 d24,d24,d22
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5]
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]!
+ vmlal.u32 q13,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q6,d28,d1[0]
+ vshl.i64 d29,d25,#16
+ vmlal.u32 q7,d28,d1[1]
+ vadd.u64 d29,d29,d24
+ vmlal.u32 q8,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q9,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6]
+ vmlal.u32 q10,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q12,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q13,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q6,d29,d5[0]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vadd.u64 d24,d24,d25
+ vmlal.u32 q9,d29,d6[1]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vadd.u64 d26,d26,d24
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6]
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]!
+ vmlal.u32 q6,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q7,d28,d1[0]
+ vshl.i64 d29,d27,#16
+ vmlal.u32 q8,d28,d1[1]
+ vadd.u64 d29,d29,d26
+ vmlal.u32 q9,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q10,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7]
+ vmlal.u32 q11,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q12,d28,d3[1]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d29,d5[0]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vadd.u64 d26,d26,d27
+ vmlal.u32 q10,d29,d6[1]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q11,d29,d7[0]
+ vmlal.u32 q12,d29,d7[1]
+ vadd.u64 d12,d12,d26
+ vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7]
+ add r10,sp,#8 @ rewind
+ sub r8,r5,#8
+ b LNEON_8n_inner
+
+.align 4
+LNEON_8n_inner:
+ subs r8,r8,#8
+ vmlal.u32 q6,d28,d0[0]
+ vld1.64 {q13},[r6,:128]
+ vmlal.u32 q7,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0]
+ vmlal.u32 q8,d28,d1[0]
+ vld1.32 {d4,d5,d6,d7},[r3]!
+ vmlal.u32 q9,d28,d1[1]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1]
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vmlal.u32 q11,d29,d6[1]
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vst1.64 {q6},[r7,:128]!
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]
+ vmlal.u32 q8,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1]
+ vmlal.u32 q9,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d1[1]
+ vmlal.u32 q11,d28,d2[0]
+ vmlal.u32 q12,d28,d2[1]
+ vmlal.u32 q13,d28,d3[0]
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2]
+ vmlal.u32 q7,d29,d4[0]
+ vmlal.u32 q8,d29,d4[1]
+ vmlal.u32 q9,d29,d5[0]
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vmlal.u32 q12,d29,d6[1]
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vst1.64 {q7},[r7,:128]!
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]
+ vmlal.u32 q9,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2]
+ vmlal.u32 q10,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q11,d28,d1[1]
+ vmlal.u32 q12,d28,d2[0]
+ vmlal.u32 q13,d28,d2[1]
+ vmlal.u32 q6,d28,d3[0]
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3]
+ vmlal.u32 q8,d29,d4[0]
+ vmlal.u32 q9,d29,d4[1]
+ vmlal.u32 q10,d29,d5[0]
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vmlal.u32 q13,d29,d6[1]
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vst1.64 {q8},[r7,:128]!
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]
+ vmlal.u32 q10,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3]
+ vmlal.u32 q11,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q12,d28,d1[1]
+ vmlal.u32 q13,d28,d2[0]
+ vmlal.u32 q6,d28,d2[1]
+ vmlal.u32 q7,d28,d3[0]
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4]
+ vmlal.u32 q9,d29,d4[0]
+ vmlal.u32 q10,d29,d4[1]
+ vmlal.u32 q11,d29,d5[0]
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vmlal.u32 q6,d29,d6[1]
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vst1.64 {q9},[r7,:128]!
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]
+ vmlal.u32 q11,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4]
+ vmlal.u32 q12,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q13,d28,d1[1]
+ vmlal.u32 q6,d28,d2[0]
+ vmlal.u32 q7,d28,d2[1]
+ vmlal.u32 q8,d28,d3[0]
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5]
+ vmlal.u32 q10,d29,d4[0]
+ vmlal.u32 q11,d29,d4[1]
+ vmlal.u32 q12,d29,d5[0]
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vmlal.u32 q7,d29,d6[1]
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vst1.64 {q10},[r7,:128]!
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]
+ vmlal.u32 q12,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5]
+ vmlal.u32 q13,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q6,d28,d1[1]
+ vmlal.u32 q7,d28,d2[0]
+ vmlal.u32 q8,d28,d2[1]
+ vmlal.u32 q9,d28,d3[0]
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6]
+ vmlal.u32 q11,d29,d4[0]
+ vmlal.u32 q12,d29,d4[1]
+ vmlal.u32 q13,d29,d5[0]
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vmlal.u32 q8,d29,d6[1]
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vst1.64 {q11},[r7,:128]!
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]
+ vmlal.u32 q13,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6]
+ vmlal.u32 q6,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q7,d28,d1[1]
+ vmlal.u32 q8,d28,d2[0]
+ vmlal.u32 q9,d28,d2[1]
+ vmlal.u32 q10,d28,d3[0]
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7]
+ vmlal.u32 q12,d29,d4[0]
+ vmlal.u32 q13,d29,d4[1]
+ vmlal.u32 q6,d29,d5[0]
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vmlal.u32 q9,d29,d6[1]
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vst1.64 {q12},[r7,:128]!
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]
+ vmlal.u32 q6,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7]
+ vmlal.u32 q7,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q8,d28,d1[1]
+ vmlal.u32 q9,d28,d2[0]
+ vmlal.u32 q10,d28,d2[1]
+ vmlal.u32 q11,d28,d3[0]
+ vmlal.u32 q12,d28,d3[1]
+ it eq
+ subeq r1,r1,r5,lsl#2 @ rewind
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q6,d29,d4[1]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q7,d29,d5[0]
+ add r10,sp,#8 @ rewind
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vmlal.u32 q10,d29,d6[1]
+ vmlal.u32 q11,d29,d7[0]
+ vst1.64 {q13},[r7,:128]!
+ vmlal.u32 q12,d29,d7[1]
+
+ bne LNEON_8n_inner
+ add r6,sp,#128
+ vst1.64 {q6,q7},[r7,:256]!
+ veor q2,q2,q2 @ d4-d5
+ vst1.64 {q8,q9},[r7,:256]!
+ veor q3,q3,q3 @ d6-d7
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12},[r7,:128]
+
+ subs r9,r9,#8
+ vld1.64 {q6,q7},[r6,:256]!
+ vld1.64 {q8,q9},[r6,:256]!
+ vld1.64 {q10,q11},[r6,:256]!
+ vld1.64 {q12,q13},[r6,:256]!
+
+ itt ne
+ subne r3,r3,r5,lsl#2 @ rewind
+ bne LNEON_8n_outer
+
+ add r7,sp,#128
+ vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame
+ vshr.u64 d10,d12,#16
+ vst1.64 {q2,q3},[sp,:256]!
+ vadd.u64 d13,d13,d10
+ vst1.64 {q2,q3}, [sp,:256]!
+ vshr.u64 d10,d13,#16
+ vst1.64 {q2,q3}, [sp,:256]!
+ vzip.16 d12,d13
+
+ mov r8,r5
+ b LNEON_tail_entry
+
+.align 4
+LNEON_tail:
+ vadd.u64 d12,d12,d10
+ vshr.u64 d10,d12,#16
+ vld1.64 {q8,q9}, [r6, :256]!
+ vadd.u64 d13,d13,d10
+ vld1.64 {q10,q11}, [r6, :256]!
+ vshr.u64 d10,d13,#16
+ vld1.64 {q12,q13}, [r6, :256]!
+ vzip.16 d12,d13
+
+LNEON_tail_entry:
+ vadd.u64 d14,d14,d10
+ vst1.32 {d12[0]}, [r7, :32]!
+ vshr.u64 d10,d14,#16
+ vadd.u64 d15,d15,d10
+ vshr.u64 d10,d15,#16
+ vzip.16 d14,d15
+ vadd.u64 d16,d16,d10
+ vst1.32 {d14[0]}, [r7, :32]!
+ vshr.u64 d10,d16,#16
+ vadd.u64 d17,d17,d10
+ vshr.u64 d10,d17,#16
+ vzip.16 d16,d17
+ vadd.u64 d18,d18,d10
+ vst1.32 {d16[0]}, [r7, :32]!
+ vshr.u64 d10,d18,#16
+ vadd.u64 d19,d19,d10
+ vshr.u64 d10,d19,#16
+ vzip.16 d18,d19
+ vadd.u64 d20,d20,d10
+ vst1.32 {d18[0]}, [r7, :32]!
+ vshr.u64 d10,d20,#16
+ vadd.u64 d21,d21,d10
+ vshr.u64 d10,d21,#16
+ vzip.16 d20,d21
+ vadd.u64 d22,d22,d10
+ vst1.32 {d20[0]}, [r7, :32]!
+ vshr.u64 d10,d22,#16
+ vadd.u64 d23,d23,d10
+ vshr.u64 d10,d23,#16
+ vzip.16 d22,d23
+ vadd.u64 d24,d24,d10
+ vst1.32 {d22[0]}, [r7, :32]!
+ vshr.u64 d10,d24,#16
+ vadd.u64 d25,d25,d10
+ vshr.u64 d10,d25,#16
+ vzip.16 d24,d25
+ vadd.u64 d26,d26,d10
+ vst1.32 {d24[0]}, [r7, :32]!
+ vshr.u64 d10,d26,#16
+ vadd.u64 d27,d27,d10
+ vshr.u64 d10,d27,#16
+ vzip.16 d26,d27
+ vld1.64 {q6,q7}, [r6, :256]!
+ subs r8,r8,#8
+ vst1.32 {d26[0]}, [r7, :32]!
+ bne LNEON_tail
+
+ vst1.32 {d10[0]}, [r7, :32] @ top-most bit
+ sub r3,r3,r5,lsl#2 @ rewind r3
+ subs r1,sp,#0 @ clear carry flag
+ add r2,sp,r5,lsl#2
+
+LNEON_sub:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r3!, {r8,r9,r10,r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne LNEON_sub
+
+ ldr r10, [r1] @ load top-most bit
+ mov r11,sp
+ veor q0,q0,q0
+ sub r11,r2,r11 @ this is num*4
+ veor q1,q1,q1
+ mov r1,sp
+ sub r0,r0,r11 @ rewind r0
+ mov r3,r2 @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+LNEON_copy_n_zap:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ ldmia r1, {r4,r5,r6,r7}
+ stmia r0!, {r8,r9,r10,r11}
+ sub r1,r1,#16
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r1,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne LNEON_copy_n_zap
+
+ mov sp,ip
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ bx lr @ bx lr
+
+#endif
+.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm _OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol _OPENSSL_armcap_P
+.long 0
+.private_extern _OPENSSL_armcap_P
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/bsaes-armv7.S b/apple-arm/crypto/fipsmodule/bsaes-armv7.S
new file mode 100644
index 0000000..8329a8c
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -0,0 +1,1536 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code 32
+# undef __thumb2__
+#endif
+
+#ifdef __thumb2__
+.thumb_func _bsaes_decrypt8
+#endif
+.align 4
+_bsaes_decrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,LM0ISR
+#else
+ add r6,r6,#LM0ISR-_bsaes_decrypt8
+#endif
+
+ vldmia r6!, {q8} @ LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose LBS0
+ vmov.i8 q9,#0x33 @ compose LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b Ldec_sbox
+.align 4
+Ldec_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne Ldec_loop
+ vldmia r6, {q12} @ LISRM0
+ b Ldec_loop
+.align 4
+Ldec_done:
+ vmov.i8 q8,#0x55 @ compose LBS0
+ vmov.i8 q9,#0x33 @ compose LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+
+
+
+.align 6
+_bsaes_const:
+LM0ISR:@ InvShiftRows constants
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+LM0SR:@ ShiftRows constants
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 6
+
+
+#ifdef __thumb2__
+.thumb_func _bsaes_encrypt8
+#endif
+.align 4
+_bsaes_encrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,LM0SR
+#else
+ sub r6,r6,#_bsaes_encrypt8-LM0SR
+#endif
+
+ vldmia r6!, {q8} @ LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose LBS0
+ vmov.i8 q9,#0x33 @ compose LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b Lenc_sbox
+.align 4
+Lenc_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ LSR
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne Lenc_loop
+ vldmia r6, {q12} @ LSRM0
+ b Lenc_loop
+.align 4
+Lenc_done:
+ vmov.i8 q8,#0x55 @ compose LBS0
+ vmov.i8 q9,#0x33 @ compose LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+
+#ifdef __thumb2__
+.thumb_func _bsaes_key_convert
+#endif
+.align 4
+_bsaes_key_convert:
+ adr r6,.
+ vld1.8 {q7}, [r4]! @ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,LM0
+#else
+ sub r6,r6,#_bsaes_key_convert-LM0
+#endif
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b Lkey_loop
+
+.align 4
+Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
+ bne Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose L63
+ @ don't save last round key
+ bx lr
+
+.globl _bsaes_cbc_encrypt
+.private_extern _bsaes_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func _bsaes_cbc_encrypt
+#endif
+.align 5
+_bsaes_cbc_encrypt:
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b Lcbc_dec_loop
+
+.align 4
+Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi Lcbc_dec_loop_finish
+
+ vld1.8 {q0,q1}, [r0]! @ load input
+ vld1.8 {q2,q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4,q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6,q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b Lcbc_dec_loop
+
+Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq Lcbc_dec_done
+
+ @ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+ beq Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0,q1}, [r1]! @ write output
+ b Lcbc_dec_done
+.align 4
+Lcbc_dec_one:
+ sub r0, r0, #0x10
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vst1.8 {q0}, [r1]! @ write output
+
+Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+Lcbc_dec_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+
+.globl _bsaes_ctr32_encrypt_blocks
+.private_extern _bsaes_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func _bsaes_ctr32_encrypt_blocks
+#endif
+.align 5
+_bsaes_ctr32_encrypt_blocks:
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+#ifdef __APPLE__
+ mov r8, #:lower16:(LREVM0SR-LM0)
+ add r8, r6, r8
+#else
+ add r8, r6, #LREVM0SR-LM0 @ borrow r8
+#endif
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+ add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b Lctr_enc_loop
+
+.align 4
+Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+#ifdef __APPLE__
+ mov r6, #:lower16:(LREVM0SR-LSR)
+ sub r6, r8, r6
+#else
+ sub r6, r8, #LREVM0SR-LSR @ pass constants
+#endif
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo Lctr_enc_loop_done
+
+ vld1.8 {q8,q9}, [r0]! @ load input
+ vld1.8 {q10,q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne Lctr_enc_loop
+ b Lctr_enc_done
+
+.align 4
+Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+Lctr_enc_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne Lctr_enc_bzero
+#else
+ vstmia sp, {q0,q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
+
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
+
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/ghash-armv4.S b/apple-arm/crypto/fipsmodule/ghash-armv4.S
new file mode 100644
index 0000000..36f4cce
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/ghash-armv4.S
@@ -0,0 +1,258 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl _gcm_init_neon
+.private_extern _gcm_init_neon
+#ifdef __thumb2__
+.thumb_func _gcm_init_neon
+#endif
+.align 4
+_gcm_init_neon: @ r0 = Htable (out), r1 = H (in, 16 bytes); precompute "twisted" H for NEON GHASH
+ vld1.64 d7,[r1]! @ load H
+ vmov.i8 q8,#0xe1 @ GCM reduction constant material (0xe1 repeated)
+ vld1.64 d6,[r1]
+ vshl.i64 d17,#57
+ vshr.u64 d16,#63 @ t0=0xc2....01
+ vdup.8 q9,d7[7]
+ vshr.u64 d26,d6,#63 @ carry out of low half for the H<<1 shift
+ vshr.s8 q9,#7 @ broadcast carry bit
+ vshl.i64 q3,q3,#1
+ vand q8,q8,q9 @ conditionally keep reduction constant (mask by carry)
+ vorr d7,d26 @ H<<<=1
+ veor q3,q3,q8 @ twisted H
+ vstmia r0,{q3} @ store twisted H into Htable
+
+ bx lr @ bx lr
+
+
+.globl _gcm_gmult_neon
+.private_extern _gcm_gmult_neon
+#ifdef __thumb2__
+.thumb_func _gcm_gmult_neon
+#endif
+.align 4
+_gcm_gmult_neon: @ r0 = Xi (in/out), r1 = twisted Htable; single-block GHASH multiply
+ vld1.64 d7,[r0]! @ load Xi
+ vld1.64 d6,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff @ masks used to isolate vmull.p8 partial products
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q3,q3 @ byte-swap Xi on little-endian
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+ mov r3,#16 @ process exactly one 16-byte block
+ b Lgmult_neon @ fall into shared multiply/reduce path in gcm_ghash_neon (writes Xi back)
+
+
+.globl _gcm_ghash_neon
+.private_extern _gcm_ghash_neon
+#ifdef __thumb2__
+.thumb_func _gcm_ghash_neon
+#endif
+.align 4
+_gcm_ghash_neon: @ r0 = Xi (in/out), r1 = twisted Htable, r2 = inp, r3 = len (bytes, multiple of 16)
+ vld1.64 d1,[r0]! @ load Xi
+ vld1.64 d0,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff @ masks for vmull.p8 partial-product alignment
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q0,q0 @ byte-swap Xi on little-endian
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+
+Loop_neon: @ one 16-byte block per iteration
+ vld1.64 d7,[r2]! @ load inp
+ vld1.64 d6,[r2]!
+#ifdef __ARMEL__
+ vrev64.8 q3,q3
+#endif
+ veor q3,q0 @ inp^=Xi
+Lgmult_neon: @ 64x64->128 carry-less mults built from 8x8 vmull.p8 (Karatsuba, 3 mults)
+ vext.8 d16, d26, d26, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d0, d6, d6, #1 @ B1
+ vmull.p8 q0, d26, d0 @ E = A*B1
+ vext.8 d18, d26, d26, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d26, d22 @ G = A*B2
+ vext.8 d20, d26, d26, #3 @ A3
+ veor q8, q8, q0 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d0, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q0, d26, d0 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d26, d22 @ K = A*B4
+ veor q10, q10, q0 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q0, d26, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q0, q0, q8
+ veor q0, q0, q10 @ q0 = lo(H)*lo(X)
+ veor d6,d6,d7 @ Karatsuba pre-processing
+ vext.8 d16, d28, d28, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d2, d6, d6, #1 @ B1
+ vmull.p8 q1, d28, d2 @ E = A*B1
+ vext.8 d18, d28, d28, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d28, d22 @ G = A*B2
+ vext.8 d20, d28, d28, #3 @ A3
+ veor q8, q8, q1 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d2, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q1, d28, d2 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d28, d22 @ K = A*B4
+ veor q10, q10, q1 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q1, d28, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q1, q1, q8
+ veor q1, q1, q10 @ q1 = middle Karatsuba product
+ vext.8 d16, d27, d27, #1 @ A1
+ vmull.p8 q8, d16, d7 @ F = A1*B
+ vext.8 d4, d7, d7, #1 @ B1
+ vmull.p8 q2, d27, d4 @ E = A*B1
+ vext.8 d18, d27, d27, #2 @ A2
+ vmull.p8 q9, d18, d7 @ H = A2*B
+ vext.8 d22, d7, d7, #2 @ B2
+ vmull.p8 q11, d27, d22 @ G = A*B2
+ vext.8 d20, d27, d27, #3 @ A3
+ veor q8, q8, q2 @ L = E + F
+ vmull.p8 q10, d20, d7 @ J = A3*B
+ vext.8 d4, d7, d7, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q2, d27, d4 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d7, d7, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d27, d22 @ K = A*B4
+ veor q10, q10, q2 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q2, d27, d7 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q2, q2, q8
+ veor q2, q2, q10 @ q2 = hi(H)*hi(X)
+ veor q1,q1,q0 @ Karatsuba post-processing
+ veor q1,q1,q2
+ veor d1,d1,d2
+ veor d4,d4,d3 @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 q9,q0,#57 @ 1st phase
+ vshl.i64 q10,q0,#62
+ veor q10,q10,q9 @
+ vshl.i64 q9,q0,#63
+ veor q10, q10, q9 @
+ veor d1,d1,d20 @
+ veor d4,d4,d21
+
+ vshr.u64 q10,q0,#1 @ 2nd phase
+ veor q2,q2,q0
+ veor q0,q0,q10 @
+ vshr.u64 q10,q10,#6
+ vshr.u64 q0,q0,#1 @
+ veor q0,q0,q2 @
+ veor q0,q0,q10 @
+
+ subs r3,#16 @ len -= 16
+ bne Loop_neon @ more input blocks remain
+
+#ifdef __ARMEL__
+ vrev64.8 q0,q0 @ back to memory byte order
+#endif
+ sub r0,#16 @ rewind Xi pointer (was post-incremented on load)
+ vst1.64 d1,[r0]! @ write out Xi
+ vst1.64 d0,[r0]
+
+ bx lr @ bx lr
+
+#endif
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/ghashv8-armx32.S b/apple-arm/crypto/fipsmodule/ghashv8-armx32.S
new file mode 100644
index 0000000..dcac580
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/ghashv8-armx32.S
@@ -0,0 +1,260 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.code 32
+#undef __thumb2__
+.globl _gcm_init_v8
+.private_extern _gcm_init_v8
+#ifdef __thumb2__
+.thumb_func _gcm_init_v8
+#endif
+.align 4
+_gcm_init_v8: @ r0 = Htable (out, 3 entries), r1 = H (in); crypto-ext PMULL via hand-encoded .byte opcodes
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r1] @ load input H
+ vmov.i8 q11,#0xe1
+ vshl.i64 q11,q11,#57 @ 0xc2.0
+ vext.8 q3,q9,q9,#8
+ vshr.u64 q10,q11,#63
+ vdup.32 q9,d18[1]
+ vext.8 q8,q10,q11,#8 @ t0=0xc2....01
+ vshr.u64 q10,q3,#63
+ vshr.s32 q9,q9,#31 @ broadcast carry bit
+ vand q10,q10,q8
+ vshl.i64 q3,q3,#1
+ vext.8 q10,q10,q10,#8
+ vand q8,q8,q9 @ conditionally keep reduction constant (masked by carry)
+ vorr q3,q3,q10 @ H<<<=1
+ veor q12,q3,q8 @ twisted H
+ vst1.64 {q12},[r0]! @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
+.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+ veor q8,q8,q12
+.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q14,q0,q10 @ q14 = twisted H^2
+
+ vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ veor q9,q9,q14
+ vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
+ vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
+ bx lr
+
+.globl _gcm_gmult_v8
+.private_extern _gcm_gmult_v8
+#ifdef __thumb2__
+.thumb_func _gcm_gmult_v8
+#endif
+.align 4
+_gcm_gmult_v8: @ r0 = Xi (in/out), r1 = Htable; single-block GHASH multiply using PMULL
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r0] @ load Xi
+ vmov.i8 q11,#0xe1
+ vld1.64 {q12,q13},[r1] @ load twisted H, ...
+ vshl.u64 q11,q11,#57 @ reduction constant 0xc2.0
+#ifndef __ARMEB__
+ vrev64.8 q9,q9 @ byte-swap Xi on little-endian
+#endif
+ vext.8 q3,q9,q9,#8
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+#ifndef __ARMEB__
+ vrev64.8 q0,q0 @ back to memory byte order
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ bx lr
+
+.globl _gcm_ghash_v8
+.private_extern _gcm_ghash_v8
+#ifdef __thumb2__
+.thumb_func _gcm_ghash_v8
+#endif
+.align 4
+_gcm_ghash_v8: @ r0 = Xi (in/out), r1 = Htable, r2 = inp, r3 = len; 2 blocks/iter via H and H^2
+ AARCH64_VALID_CALL_TARGET
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ vld1.64 {q0},[r0] @ load [rotated] Xi
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+ @ algorithm specification
+ subs r3,r3,#32 @ see if r3 is 32 or larger
+ mov r12,#16 @ r12 is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ r12 is zeroed just in time
+ @ to preclude overstepping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
+ vmov.i8 q11,#0xe1
+ vld1.64 {q14},[r1]
+ moveq r12,#0 @ is it time to zero r12?
+ vext.8 q0,q0,q0,#8 @ rotate Xi
+ vld1.64 {q8},[r2]! @ load [rotated] I[0]
+ vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+ vrev64.8 q0,q0
+#endif
+ vext.8 q3,q8,q8,#8 @ rotate I[0]
+ blo Lodd_tail_v8 @ r3 was less than 32
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vext.8 q7,q9,q9,#8
+ veor q3,q3,q0 @ I[i]^=Xi
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q9,q9,q7 @ Karatsuba pre-processing
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ b Loop_mod2x_v8
+
+.align 4
+Loop_mod2x_v8: @ modulo-scheduled: processes 2 blocks per iteration
+ vext.8 q10,q3,q3,#8
+ subs r3,r3,#32 @ is there more data?
+.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
+ movlo r12,#0 @ is it time to zero r12?
+
+.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
+ veor q10,q10,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
+ veor q0,q0,q4 @ accumulate
+.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
+
+ veor q2,q2,q6
+ moveq r12,#0 @ is it time to zero r12?
+ veor q1,q1,q5
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+#endif
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ vext.8 q7,q9,q9,#8
+ vext.8 q3,q8,q8,#8
+ veor q0,q1,q10
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q3,q3,q2 @ accumulate q3 early
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q3,q3,q10
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ veor q3,q3,q0
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ bhs Loop_mod2x_v8 @ there was at least 32 more bytes
+
+ veor q2,q2,q10
+ vext.8 q3,q8,q8,#8 @ re-construct q3
+ adds r3,r3,#32 @ re-construct r3
+ veor q0,q0,q2 @ re-construct q0
+ beq Ldone_v8 @ is r3 zero?
+Lodd_tail_v8: @ one final 16-byte block remains
+ vext.8 q10,q0,q0,#8
+ veor q3,q3,q0 @ inp^=Xi
+ veor q9,q8,q10 @ q9 is rotated inp^Xi
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+Ldone_v8:
+#ifndef __ARMEB__
+ vrev64.8 q0,q0 @ back to memory byte order
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ bx lr
+
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha1-armv4-large.S b/apple-arm/crypto/fipsmodule/sha1-armv4-large.S
new file mode 100644
index 0000000..82ac8df
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha1-armv4-large.S
@@ -0,0 +1,1518 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl _sha1_block_data_order
+.private_extern _sha1_block_data_order
+#ifdef __thumb2__
+.thumb_func _sha1_block_data_order
+#endif
+
+.align 5
+_sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+Lsha1_block:
+ adr r3,Lsha1_block
+ ldr r12,LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV8_SHA1
+ bne LARMv8
+ tst r12,#ARMV7_NEON
+ bne LNEON
+#endif
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ ldmia r0,{r3,r4,r5,r6,r7}
+Lloop:
+ ldr r8,LK_00_19
+ mov r14,sp
+ sub sp,sp,#15*4
+ mov r5,r5,ror#30
+ mov r6,r6,ror#30
+ mov r7,r7,ror#30 @ [6]
+L_00_15:
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ eor r10,r4,r5 @ F_xx_xx
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r3,r10,ror#2
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ eor r10,r3,r4 @ F_xx_xx
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r7,r10,ror#2
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ eor r10,r7,r3 @ F_xx_xx
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r6,r10,ror#2
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ eor r10,r6,r7 @ F_xx_xx
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r5,r10,ror#2
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne L_00_15 @ [((11+4)*5+2)*3]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+
+ ldr r8,LK_20_39 @ [+15+16*4]
+ cmn sp,#0 @ [+3], clear carry to denote 20_39
+L_20_39_or_60_79:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r4,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp @ preserve carry
+#endif
+ bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
+ bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes
+
+ ldr r8,LK_40_59
+ sub sp,sp,#20*4 @ [+2]
+L_40_59:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r4,r10,ror#2 @ F_xx_xx
+ and r11,r5,r6 @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_40_59(B,C,D)
+ add r7,r7,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ and r11,r4,r5 @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_40_59(B,C,D)
+ add r6,r6,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ and r11,r3,r4 @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_40_59(B,C,D)
+ add r5,r5,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ and r11,r7,r3 @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_40_59(B,C,D)
+ add r4,r4,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ and r11,r6,r7 @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_40_59(B,C,D)
+ add r3,r3,r11,ror#2
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne L_40_59 @ [+((12+5)*5+2)*4]
+
+ ldr r8,LK_60_79
+ sub sp,sp,#20*4
+ cmp sp,#0 @ set carry to denote 60_79
+ b L_20_39_or_60_79 @ [+4], spare 300 bytes
+L_done:
+ add sp,sp,#80*4 @ "deallocate" stack frame
+ ldmia r0,{r8,r9,r10,r11,r12}
+ add r3,r8,r3
+ add r4,r9,r4
+ add r5,r10,r5,ror#2
+ add r6,r11,r6,ror#2
+ add r7,r12,r7,ror#2
+ stmia r0,{r3,r4,r5,r6,r7}
+ teq r1,r2
+ bne Lloop @ [+18], total 1307
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+
+
+.align 5
+LK_00_19:.word 0x5a827999
+LK_20_39:.word 0x6ed9eba1
+LK_40_59:.word 0x8f1bbcdc
+LK_60_79:.word 0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+LOPENSSL_armcap:
+.word OPENSSL_armcap_P-Lsha1_block
+#endif
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 5
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func sha1_block_data_order_neon
+#endif
+.align 4
+sha1_block_data_order_neon:
+LNEON:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ @ dmb @ errata #451034 on early Cortex A8
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
+ mov r14,sp
+ sub r12,sp,#64
+ adr r8,LK_00_19
+ bic r12,r12,#15 @ align for 128-bit stores
+
+ ldmia r0,{r3,r4,r5,r6,r7} @ load context
+ mov sp,r12 @ alloca
+
+ vld1.8 {q0,q1},[r1]! @ handles unaligned
+ veor q15,q15,q15
+ vld1.8 {q2,q3},[r1]!
+ vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19
+ vrev32.8 q0,q0 @ yes, even on
+ vrev32.8 q1,q1 @ big-endian...
+ vrev32.8 q2,q2
+ vadd.i32 q8,q0,q14
+ vrev32.8 q3,q3
+ vadd.i32 q9,q1,q14
+ vst1.32 {q8},[r12,:128]!
+ vadd.i32 q10,q2,q14
+ vst1.32 {q9},[r12,:128]!
+ vst1.32 {q10},[r12,:128]!
+ ldr r9,[sp] @ big RAW stall
+
+Loop_neon:
+ vext.8 q8,q0,q1,#8
+ bic r10,r6,r4
+ add r7,r7,r9
+ and r11,r5,r4
+ vadd.i32 q13,q3,q14
+ ldr r9,[sp,#4]
+ add r7,r7,r3,ror#27
+ vext.8 q12,q3,q15,#4
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q8,q8,q0
+ bic r10,r5,r3
+ add r6,r6,r9
+ veor q12,q12,q2
+ and r11,r4,r3
+ ldr r9,[sp,#8]
+ veor q12,q12,q8
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r4,r7
+ add r5,r5,r9
+ vadd.i32 q8,q12,q12
+ and r11,r3,r7
+ ldr r9,[sp,#12]
+ vsri.32 q8,q12,#31
+ add r5,r5,r6,ror#27
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vshr.u32 q12,q13,#30
+ add r5,r5,r11
+ bic r10,r3,r6
+ vshl.u32 q13,q13,#2
+ add r4,r4,r9
+ and r11,r7,r6
+ veor q8,q8,q12
+ ldr r9,[sp,#16]
+ add r4,r4,r5,ror#27
+ veor q8,q8,q13
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q9,q1,q2,#8
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ vadd.i32 q13,q8,q14
+ ldr r9,[sp,#20]
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r4,ror#27
+ vext.8 q12,q8,q15,#4
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ veor q9,q9,q1
+ bic r10,r6,r4
+ add r7,r7,r9
+ veor q12,q12,q3
+ and r11,r5,r4
+ ldr r9,[sp,#24]
+ veor q12,q12,q9
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r5,r3
+ add r6,r6,r9
+ vadd.i32 q9,q12,q12
+ and r11,r4,r3
+ ldr r9,[sp,#28]
+ vsri.32 q9,q12,#31
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vshr.u32 q12,q13,#30
+ add r6,r6,r11
+ bic r10,r4,r7
+ vshl.u32 q13,q13,#2
+ add r5,r5,r9
+ and r11,r3,r7
+ veor q9,q9,q12
+ ldr r9,[sp,#32]
+ add r5,r5,r6,ror#27
+ veor q9,q9,q13
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q10,q2,q3,#8
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ vadd.i32 q13,q9,q14
+ ldr r9,[sp,#36]
+ add r4,r4,r5,ror#27
+ vext.8 q12,q9,q15,#4
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q10,q10,q2
+ bic r10,r7,r5
+ add r3,r3,r9
+ veor q12,q12,q8
+ and r11,r6,r5
+ ldr r9,[sp,#40]
+ veor q12,q12,q10
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r6,r4
+ add r7,r7,r9
+ vadd.i32 q10,q12,q12
+ and r11,r5,r4
+ ldr r9,[sp,#44]
+ vsri.32 q10,q12,#31
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ vshr.u32 q12,q13,#30
+ add r7,r7,r11
+ bic r10,r5,r3
+ vshl.u32 q13,q13,#2
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q10,q10,q12
+ ldr r9,[sp,#48]
+ add r6,r6,r7,ror#27
+ veor q10,q10,q13
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q11,q3,q8,#8
+ bic r10,r4,r7
+ add r5,r5,r9
+ and r11,r3,r7
+ vadd.i32 q13,q10,q14
+ ldr r9,[sp,#52]
+ add r5,r5,r6,ror#27
+ vext.8 q12,q10,q15,#4
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q11,q11,q3
+ bic r10,r3,r6
+ add r4,r4,r9
+ veor q12,q12,q9
+ and r11,r7,r6
+ ldr r9,[sp,#56]
+ veor q12,q12,q11
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r7,r5
+ add r3,r3,r9
+ vadd.i32 q11,q12,q12
+ and r11,r6,r5
+ ldr r9,[sp,#60]
+ vsri.32 q11,q12,#31
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ vshr.u32 q12,q13,#30
+ add r3,r3,r11
+ bic r10,r6,r4
+ vshl.u32 q13,q13,#2
+ add r7,r7,r9
+ and r11,r5,r4
+ veor q11,q11,q12
+ ldr r9,[sp,#0]
+ add r7,r7,r3,ror#27
+ veor q11,q11,q13
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q10,q11,#8
+ bic r10,r5,r3
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q0,q0,q8
+ ldr r9,[sp,#4]
+ add r6,r6,r7,ror#27
+ veor q0,q0,q1
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vadd.i32 q13,q11,q14
+ add r6,r6,r11
+ bic r10,r4,r7
+ veor q12,q12,q0
+ add r5,r5,r9
+ and r11,r3,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vsli.32 q0,q12,#2
+ add r5,r5,r11
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ ldr r9,[sp,#12]
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ ldr r9,[sp,#16]
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q11,q0,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q1,q1,q2
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q0,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q1
+ ldr r9,[sp,#24]
+ eor r11,r10,r4
+ vshr.u32 q1,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q1,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q0,q1,#8
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ veor q2,q2,q3
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vadd.i32 q13,q1,q14
+ eor r10,r4,r6
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r7,r7,r9
+ veor q12,q12,q2
+ ldr r9,[sp,#40]
+ eor r11,r10,r5
+ vshr.u32 q2,q12,#30
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r7,r7,r11
+ eor r10,r3,r5
+ vsli.32 q2,q12,#2
+ add r6,r6,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ veor q3,q3,q8
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r6
+ vshr.u32 q3,q12,#30
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vsli.32 q3,q12,#2
+ add r7,r7,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q2,q3,#8
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#4]
+ veor q8,q8,q0
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ veor q8,q8,q9
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r6,r3
+ add r4,r4,r9
+ veor q12,q12,q8
+ ldr r9,[sp,#8]
+ eor r11,r10,r7
+ vshr.u32 q8,q12,#30
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ add r4,r4,r11
+ eor r10,r5,r7
+ vsli.32 q8,q12,#2
+ add r3,r3,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q3,q8,#8
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#20]
+ veor q9,q9,q1
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ veor q9,q9,q10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vadd.i32 q13,q8,q14
+ eor r10,r7,r4
+ add r5,r5,r9
+ veor q12,q12,q9
+ ldr r9,[sp,#24]
+ eor r11,r10,r3
+ vshr.u32 q9,q12,#30
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r5,r5,r11
+ eor r10,r6,r3
+ vsli.32 q9,q12,#2
+ add r4,r4,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q8,q9,#8
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#36]
+ veor q10,q10,q2
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ veor q10,q10,q11
+ add r7,r7,r10
+ and r11,r11,r4
+ vadd.i32 q13,q9,q14
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q12,q12,q10
+ add r6,r6,r9
+ and r10,r4,r5
+ vshr.u32 q10,q12,#30
+ ldr r9,[sp,#40]
+ add r6,r6,r7,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r4,r5
+ add r6,r6,r10
+ vsli.32 q10,q12,#2
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#44]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#48]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q9,q10,#8
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#52]
+ veor q11,q11,q3
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ veor q11,q11,q0
+ add r3,r3,r10
+ and r11,r11,r5
+ vadd.i32 q13,q10,q14
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ veor q12,q12,q11
+ add r7,r7,r9
+ and r10,r5,r6
+ vshr.u32 q11,q12,#30
+ ldr r9,[sp,#56]
+ add r7,r7,r3,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r5,r6
+ add r7,r7,r10
+ vsli.32 q11,q12,#2
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#60]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#0]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q10,q11,#8
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#4]
+ veor q0,q0,q8
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ veor q0,q0,q1
+ add r4,r4,r10
+ and r11,r11,r6
+ vadd.i32 q13,q11,q14
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q12,q12,q0
+ add r3,r3,r9
+ and r10,r6,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r3,r3,r4,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r6,r7
+ add r3,r3,r10
+ vsli.32 q0,q12,#2
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#12]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#16]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q11,q0,#8
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ veor q1,q1,q2
+ add r5,r5,r10
+ and r11,r11,r7
+ vadd.i32 q13,q0,q14
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q12,q12,q1
+ add r4,r4,r9
+ and r10,r7,r3
+ vshr.u32 q1,q12,#30
+ ldr r9,[sp,#24]
+ add r4,r4,r5,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r7,r3
+ add r4,r4,r10
+ vsli.32 q1,q12,#2
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#28]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#32]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q0,q1,#8
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ veor q2,q2,q3
+ add r6,r6,r10
+ and r11,r11,r3
+ vadd.i32 q13,q1,q14
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ veor q12,q12,q2
+ add r5,r5,r9
+ and r10,r3,r4
+ vshr.u32 q2,q12,#30
+ ldr r9,[sp,#40]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r3,r4
+ add r5,r5,r10
+ vsli.32 q2,q12,#2
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#44]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#48]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q3,q3,q8
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r4
+ vshr.u32 q3,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q3,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ teq r1,r2
+ sub r8,r8,#16
+ it eq
+ subeq r1,r1,#64
+ vld1.8 {q0,q1},[r1]!
+ ldr r9,[sp,#4]
+ eor r11,r10,r6
+ vld1.8 {q2,q3},[r1]!
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vrev32.8 q0,q0
+ add r7,r7,r9
+ ldr r9,[sp,#8]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vrev32.8 q1,q1
+ eor r10,r6,r3
+ add r4,r4,r9
+ vadd.i32 q8,q0,q14
+ ldr r9,[sp,#20]
+ eor r11,r10,r7
+ vst1.32 {q8},[r12,:128]!
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#24]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vrev32.8 q2,q2
+ eor r10,r7,r4
+ add r5,r5,r9
+ vadd.i32 q9,q1,q14
+ ldr r9,[sp,#36]
+ eor r11,r10,r3
+ vst1.32 {q9},[r12,:128]!
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#40]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vrev32.8 q3,q3
+ eor r10,r3,r5
+ add r6,r6,r9
+ vadd.i32 q10,q2,q14
+ ldr r9,[sp,#52]
+ eor r11,r10,r4
+ vst1.32 {q10},[r12,:128]!
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#56]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ ldmia r0,{r9,r10,r11,r12} @ accumulate context
+ add r3,r3,r9
+ ldr r9,[r0,#16]
+ add r4,r4,r10
+ add r5,r5,r11
+ add r6,r6,r12
+ it eq
+ moveq sp,r14
+ add r7,r7,r9
+ it ne
+ ldrne r9,[sp]
+ stmia r0,{r3,r4,r5,r6,r7}
+ itt ne
+ addne r12,sp,#3*16
+ bne Loop_neon
+
+ @ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
+#ifdef __thumb2__
+.thumb_func sha1_block_data_order_armv8
+#endif
+.align 5
+sha1_block_data_order_armv8:
+LARMv8:
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+
+ veor q1,q1,q1
+ adr r3,LK_00_19
+ vld1.32 {q0},[r0]!
+ vld1.32 {d2[0]},[r0]
+ sub r0,r0,#16
+ vld1.32 {d16[],d17[]},[r3,:32]!
+ vld1.32 {d18[],d19[]},[r3,:32]!
+ vld1.32 {d20[],d21[]},[r3,:32]!
+ vld1.32 {d22[],d23[]},[r3,:32]
+
+Loop_v8:
+ vld1.8 {q4,q5},[r1]!
+ vld1.8 {q6,q7},[r1]!
+ vrev32.8 q4,q4
+ vrev32.8 q5,q5
+
+ vadd.i32 q12,q8,q4
+ vrev32.8 q6,q6
+ vmov q14,q0 @ offload
+ subs r2,r2,#1
+
+ vadd.i32 q13,q8,q5
+ vrev32.8 q7,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0
+ INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12
+ vadd.i32 q12,q8,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q8,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q8,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q9,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q9,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q10,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q10,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q11,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q11,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q11,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q7
+
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+
+ vadd.i32 q1,q1,q2
+ vadd.i32 q0,q0,q14
+ bne Loop_v8
+
+ vst1.32 {q0},[r0]!
+ vst1.32 {d2[0]},[r0]
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ bx lr @ bx lr
+
+#endif
+#if __ARM_MAX_ARCH__>=7
+.comm _OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol _OPENSSL_armcap_P
+.long 0
+.private_extern _OPENSSL_armcap_P
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha256-armv4.S b/apple-arm/crypto/fipsmodule/sha256-armv4.S
new file mode 100644
index 0000000..0cf3648
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha256-armv4.S
@@ -0,0 +1,2846 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually-encoded. (See unsha256.)
+
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+LOPENSSL_armcap:
+.word OPENSSL_armcap_P-Lsha256_block_data_order
+#endif
+.align 5
+
+.globl _sha256_block_data_order
+.private_extern _sha256_block_data_order
+#ifdef __thumb2__
+.thumb_func _sha256_block_data_order
+#endif
+_sha256_block_data_order:
+Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ _sha256_block_data_order
+#else
+ adr r3,Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV8_SHA256
+ bne LARMv8
+ tst r12,#ARMV7_NEON
+ bne LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl _sha256_block_data_order_neon
+.private_extern _sha256_block_data_order_neon
+#ifdef __thumb2__
+.thumb_func _sha256_block_data_order_neon
+#endif
+.align 5
+.skip 16
+_sha256_block_data_order_neon:
+LNEON:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b L_00_48
+
+.align 4
+L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8,r9,r10,r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne L_00_48
+
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+#ifdef __thumb2__
+.thumb_func sha256_block_data_order_armv8
+#endif
+.align 5
+sha256_block_data_order_armv8:
+LARMv8:
+ vld1.32 {q0,q1},[r0]
+ sub r3,r3,#256+32
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ b Loop_v8
+
+.align 4
+Loop_v8:
+ vld1.8 {q8,q9},[r1]!
+ vld1.8 {q10,q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+
+#endif
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm _OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol _OPENSSL_armcap_P
+.long 0
+.private_extern _OPENSSL_armcap_P
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha512-armv4.S b/apple-arm/crypto/fipsmodule/sha512-armv4.S
new file mode 100644
index 0000000..21913cb
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha512-armv4.S
@@ -0,0 +1,1899 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Coxtex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is disappointing result.
+@ Technical writers asserted that 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On side note Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally caller was expected to maintain specific *dword* order in
+@ h[0-7], namely with most significant dword at *lower* address, which
+@ was reflected in below two parameters as 0 and 4. Now caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+# define adrl adr
+#else
+.code 32
+#endif
+
+
+.align 5
+K512:
+ WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+ WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+ WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+ WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+ WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+ WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+ WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+ WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+ WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+ WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+ WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+ WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+ WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+ WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+ WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+ WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+ WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+ WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+ WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+ WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+ WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+ WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+ WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+ WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+ WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+ WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+ WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+ WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+ WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+ WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+ WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+ WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+ WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+ WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+ WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+ WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+ WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+LOPENSSL_armcap:
+.word OPENSSL_armcap_P-Lsha512_block_data_order
+.skip 32-4
+#else
+.skip 32
+#endif
+
+.globl _sha512_block_data_order
+.private_extern _sha512_block_data_order
+#ifdef __thumb2__
+.thumb_func _sha512_block_data_order
+#endif
+_sha512_block_data_order:
+Lsha512_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ _sha512_block_data_order
+#else
+ adr r3,Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV7_NEON
+ bne LNEON
+#endif
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ sub r14,r3,#672 @ K512
+ sub sp,sp,#9*8
+
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+Loop:
+ str r9, [sp,#48+0]
+ str r10, [sp,#48+4]
+ str r11, [sp,#56+0]
+ str r12, [sp,#56+4]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ str r3,[sp,#8+0]
+ str r4,[sp,#8+4]
+ str r9, [sp,#16+0]
+ str r10, [sp,#16+4]
+ str r11, [sp,#24+0]
+ str r12, [sp,#24+4]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
+ str r3,[sp,#40+0]
+ str r4,[sp,#40+4]
+
+L00_15:
+#if __ARM_ARCH__<7
+ ldrb r3,[r1,#7]
+ ldrb r9, [r1,#6]
+ ldrb r10, [r1,#5]
+ ldrb r11, [r1,#4]
+ ldrb r4,[r1,#3]
+ ldrb r12, [r1,#2]
+ orr r3,r3,r9,lsl#8
+ ldrb r9, [r1,#1]
+ orr r3,r3,r10,lsl#16
+ ldrb r10, [r1],#8
+ orr r3,r3,r11,lsl#24
+ orr r4,r4,r12,lsl#8
+ orr r4,r4,r9,lsl#16
+ orr r4,r4,r10,lsl#24
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#148
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+ tst r14,#1
+ beq L00_15
+ ldr r9,[sp,#184+0]
+ ldr r10,[sp,#184+4]
+ bic r14,r14,#1
+L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
+ mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
+ eor r3,r3,r10,lsl#31
+ eor r4,r4,r9,lsl#31
+ eor r3,r3,r9,lsr#8
+ eor r4,r4,r10,lsr#8
+ eor r3,r3,r10,lsl#24
+ eor r4,r4,r9,lsl#24
+ eor r3,r3,r9,lsr#7
+ eor r4,r4,r10,lsr#7
+ eor r3,r3,r10,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov r9,r11,lsr#19
+ mov r10,r12,lsr#19
+ eor r9,r9,r12,lsl#13
+ eor r10,r10,r11,lsl#13
+ eor r9,r9,r12,lsr#29
+ eor r10,r10,r11,lsr#29
+ eor r9,r9,r11,lsl#3
+ eor r10,r10,r12,lsl#3
+ eor r9,r9,r11,lsr#6
+ eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
+ eor r9,r9,r12,lsl#26
+
+ ldr r12,[sp,#120+4]
+ adds r3,r3,r9
+ ldr r9,[sp,#192+0]
+ adc r4,r4,r10
+
+ ldr r10,[sp,#192+4]
+ adds r3,r3,r11
+ adc r4,r4,r12
+ adds r3,r3,r9
+ adc r4,r4,r10
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#23
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
+ beq L16_79
+ bic r14,r14,#1
+
+ ldr r3,[sp,#8+0]
+ ldr r4,[sp,#8+4]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
+ adds r9,r5,r9
+ str r9, [r0,#0+LO]
+ adc r10,r6,r10
+ str r10, [r0,#0+HI]
+ adds r11,r3,r11
+ str r11, [r0,#8+LO]
+ adc r12,r4,r12
+ str r12, [r0,#8+HI]
+
+ ldr r5,[sp,#16+0]
+ ldr r6,[sp,#16+4]
+ ldr r3,[sp,#24+0]
+ ldr r4,[sp,#24+4]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ adds r9,r5,r9
+ str r9, [r0,#16+LO]
+ adc r10,r6,r10
+ str r10, [r0,#16+HI]
+ adds r11,r3,r11
+ str r11, [r0,#24+LO]
+ adc r12,r4,r12
+ str r12, [r0,#24+HI]
+
+ ldr r3,[sp,#40+0]
+ ldr r4,[sp,#40+4]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
+ adds r7,r7,r9
+ str r7,[r0,#32+LO]
+ adc r8,r8,r10
+ str r8,[r0,#32+HI]
+ adds r11,r3,r11
+ str r11, [r0,#40+LO]
+ adc r12,r4,r12
+ str r12, [r0,#40+HI]
+
+ ldr r5,[sp,#48+0]
+ ldr r6,[sp,#48+4]
+ ldr r3,[sp,#56+0]
+ ldr r4,[sp,#56+4]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+ adds r9,r5,r9
+ str r9, [r0,#48+LO]
+ adc r10,r6,r10
+ str r10, [r0,#48+HI]
+ adds r11,r3,r11
+ str r11, [r0,#56+LO]
+ adc r12,r4,r12
+ str r12, [r0,#56+HI]
+
+ add sp,sp,#640
+ sub r14,r14,#640
+
+ teq r1,r2
+ bne Loop
+
+ add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl _sha512_block_data_order_neon
+.private_extern _sha512_block_data_order_neon
+#ifdef __thumb2__
+.thumb_func _sha512_block_data_order_neon
+#endif
+.align 4
+_sha512_block_data_order_neon:
+LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ adr r3,K512
+ VFP_ABI_PUSH
+ vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
+Loop_neon:
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 0>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 1>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 2>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 3>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 4>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 5>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 6>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 7>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 8>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 9>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 10>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 11>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 12>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 13>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 14>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 15>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ mov r12,#4
+L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 17>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 19>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 21>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 23>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 25>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 27>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 29>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 31>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ bne L16_79_neon
+
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne Loop_neon
+
+ VFP_ABI_POP
+ bx lr @ .word 0xe12fff1e
+
+#endif
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm _OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol _OPENSSL_armcap_P
+.long 0
+.private_extern _OPENSSL_armcap_P
+#endif
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/vpaes-armv7.S b/apple-arm/crypto/fipsmodule/vpaes-armv7.S
new file mode 100644
index 0000000..6aead7c
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/vpaes-armv7.S
@@ -0,0 +1,1265 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax unified
+
+
+
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+
+.align 7 @ totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward:@ mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward:@ mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr:@ sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+Lk_inv:@ inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt:@ input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo:@ sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1:@ sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2:@ sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+
+.align 6
+@@
+@@ _aes_preheat
+@@
+@@ Fills q9-q15 as specified below.
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_preheat
+#endif
+.align 4
+_vpaes_preheat:
+ adr r10, Lk_inv
+ vmov.i8 q9, #0x0f @ Lk_s0F
+ vld1.64 {q10,q11}, [r10]! @ Lk_inv
+ add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo
+ vld1.64 {q12,q13}, [r10]! @ Lk_sb1
+ vld1.64 {q14,q15}, [r10] @ Lk_sb2
+ bx lr
+
+@@
+@@ _aes_encrypt_core
+@@
+@@ AES-encrypt q0.
+@@
+@@ Inputs:
+@@ q0 = input
+@@ q9-q15 as in _vpaes_preheat
+@@ [r2] = scheduled keys
+@@
+@@ Output in q0
+@@ Clobbers q1-q5, r8-r11
+@@ Preserves q6-q8 so you get some local vectors
+@@
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_encrypt_core
+#endif
+.align 4
+_vpaes_encrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+ adr r11, Lk_ipt
+ @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ vld1.64 {q2, q3}, [r11]
+ adr r11, Lk_mc_forward+16
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
+ vtbl.8 d3, {q2}, d3
+ vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
+ vtbl.8 d5, {q3}, d1
+ veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+ @ .Lenc_entry ends with a bnz instruction which is normally paired with
+ @ subs in .Lenc_loop.
+ tst r8, r8
+ b Lenc_entry
+
+.align 4
+Lenc_loop:
+ @ middle of middle round
+ add r10, r11, #0x40
+ vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ vtbl.8 d9, {q13}, d5
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
+ vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ vtbl.8 d1, {q12}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ vtbl.8 d11, {q15}, d5
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ vtbl.8 d5, {q14}, d7
+ vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
+ vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ vtbl.8 d7, {q0}, d3
+ veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ @ Write to q5 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ vtbl.8 d11, {q0}, d9
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ vtbl.8 d9, {q3}, d3
+ @ Here we restore the original q0/q5 usage.
+ veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ subs r8, r8, #1 @ nr--
+
+Lenc_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ vtbl.8 d11, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
+ bne Lenc_loop
+
+ @ middle of last round
+ add r10, r11, #0x80
+
+ adr r11, Lk_sbo
+ @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+ @ overlap table and destination registers.
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
+ vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
+ @ Write to q2 instead of q0 below, to avoid overlapping table and
+ @ destination registers.
+ vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ vtbl.8 d5, {q0}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ @ Here we restore the original q0/q2 usage.
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
+ vtbl.8 d1, {q2}, d3
+ bx lr
+
+
+.globl _vpaes_encrypt
+.private_extern _vpaes_encrypt
+#ifdef __thumb2__
+.thumb_func _vpaes_encrypt
+#endif
+.align 4
+_vpaes_encrypt:
+ @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+ @ alignment.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_encrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+
+
+@
+@ Decryption stuff
+@
+
+.align 4
+_vpaes_decrypt_consts:
+Lk_dipt:@ decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo:@ decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+
+@@
+@@ Decryption core
+@@
+@@ Same API as encryption core, except it clobbers q12-q15 rather than using
+@@ the values from _vpaes_preheat. q9-q11 must still be set from
+@@ _vpaes_preheat.
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_decrypt_core
+#endif
+.align 4
+_vpaes_decrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+
+ @ This function performs shuffles with various constants. The x86_64
+ @ version loads them on-demand into %xmm0-%xmm5. This does not work well
+ @ for ARMv7 because those registers are shuffle destinations. The ARMv8
+ @ version preloads those constants into registers, but ARMv7 has half
+ @ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+ @ because decryption doesn't use those constants. The values are
+ @ constant, so this does not interfere with potential 2x optimizations.
+ adr r7, Lk_dipt
+
+ vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo
+ lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
+ eor r11, r11, #0x30 @ xor $0x30, %r11
+ adr r10, Lk_sr
+ and r11, r11, #0x30 @ and $0x30, %r11
+ add r11, r11, r10
+ adr r10, Lk_mc_forward+48
+
+ vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q12}, d3
+ vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5
+ @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q13}, d1
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+ @ .Ldec_entry ends with a bnz instruction which is normally paired with
+ @ subs in .Ldec_loop.
+ tst r8, r8
+ b Ldec_entry
+
+.align 4
+Ldec_loop:
+@
+@ Inverse mix columns
+@
+
+ @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+ @ the function.
+ adr r10, Lk_dsb9
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ @ Load sbd* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ vtbl.8 d9, {q12}, d5
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ vtbl.8 d3, {q13}, d7
+ veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
+
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ @ Load sbb* ahead of time.
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ vtbl.8 d3, {q15}, d7
+ @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+	@ Load sbe* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
+
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ vtbl.8 d9, {q12}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ vtbl.8 d3, {q13}, d7
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ vtbl.8 d3, {q15}, d7
+ vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ subs r8, r8, #1 @ sub $1,%rax # nr--
+
+Ldec_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
+ bne Ldec_loop
+
+ @ middle of last round
+
+ adr r10, Lk_dsbo
+
+ @ Write to q1 rather than q4 to avoid overlapping table and destination.
+ vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ @ Write to q2 rather than q1 to avoid overlapping table and destination.
+ vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q2}, d7
+ vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160
+ veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ @ Write to q1 rather than q0 so the table and destination registers
+ @ below do not overlap.
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
+ vtbl.8 d1, {q1}, d5
+ bx lr
+
+
+.globl _vpaes_decrypt
+.private_extern _vpaes_decrypt
+#ifdef __thumb2__
+.thumb_func _vpaes_decrypt
+#endif
+.align 4
+_vpaes_decrypt:
+ @ _vpaes_decrypt_core uses r7-r11.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_decrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ @@
+@@ AES key schedule @@
+@@ @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@ Key schedule constants
+@
+
+.align 4
+_vpaes_key_consts:
+Lk_dksd:@ decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb:@ decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9:@ decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon:@ rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt:@ output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+#ifdef __thumb2__
+.thumb_func _vpaes_key_preheat
+#endif
+.align 4
+_vpaes_key_preheat:
+ adr r11, Lk_rcon
+ vmov.i8 q12, #0x5b @ Lk_s63
+ adr r10, Lk_inv @ Must be aligned to 8 mod 16.
+ vmov.i8 q9, #0x0f @ Lk_s0F
+ vld1.64 {q10,q11}, [r10] @ Lk_inv
+ vld1.64 {q8}, [r11] @ Lk_rcon
+ bx lr
+
+
+#ifdef __thumb2__
+.thumb_func _vpaes_schedule_core
+#endif
+.align 4
+_vpaes_schedule_core:
+ @ We only need to save lr, but ARM requires an 8-byte stack alignment,
+ @ so save an extra register.
+ stmdb sp!, {r3,lr}
+
+ bl _vpaes_key_preheat @ load the tables
+
+ adr r11, Lk_ipt @ Must be aligned to 8 mod 16.
+ vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ @ input transform
+ @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+ @ overlap table and destination.
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ adr r10, Lk_sr @ Must be aligned to 8 mod 16.
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7
+
+ add r8, r8, r10
+ tst r3, r3
+ bne Lschedule_am_decrypting
+
+ @ encrypting, output zeroth round key after transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
+ b Lschedule_go
+
+Lschedule_am_decrypting:
+ @ decrypting, output zeroth round key after shiftrows
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q4}, d3
+ vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
+ eor r8, r8, #0x30 @ xor $0x30, %r8
+
+Lschedule_go:
+ cmp r1, #192 @ cmp $192, %esi
+ bhi Lschedule_256
+ beq Lschedule_192
+	@ 128: fall through
+
+@@
+@@ .schedule_128
+@@
+@@ 128-bit specific part of key schedule.
+@@
+@@ This schedule is really simple, because all its parts
+@@ are accomplished by the subroutines.
+@@
+Lschedule_128:
+ mov r0, #10 @ mov $10, %esi
+
+Loop_schedule_128:
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ write output
+ b Loop_schedule_128
+
+@@
+@@ .aes_schedule_192
+@@
+@@ 192-bit specific part of key schedule.
+@@
+@@ The main body of this schedule is the same as the 128-bit
+@@ schedule, but with more smearing. The long, high side is
+@@ stored in q7 as before, and the short, low side is in
+@@ the high bits of q6.
+@@
+@@ This schedule is somewhat nastier, however, because each
+@@ round produces 192 bits of key material, or 1.5 round keys.
+@@ Therefore, on each cycle we do 2 rounds and produce 3 round
+@@ keys.
+@@
+.align 4
+Lschedule_192:
+ sub r0, r0, #8
+ vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
+ vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov r0, #4 @ mov $4, %esi
+
+Loop_schedule_192:
+ bl _vpaes_schedule_round
+ vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle @ save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle @ save key n+1
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ save key n+2
+ bl _vpaes_schedule_192_smear
+ b Loop_schedule_192
+
+@@
+@@ .aes_schedule_256
+@@
+@@ 256-bit specific part of key schedule.
+@@
+@@ The structure here is very similar to the 128-bit
+@@ schedule, but with an additional "low side" in
+@@ q6. The low side's rounds are the same as the
+@@ high side's, except no rcon and no rotation.
+@@
+.align 4
+Lschedule_256:
+ vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ mov r0, #7 @ mov $7, %esi
+
+Loop_schedule_256:
+ bl _vpaes_schedule_mangle @ output low result
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ @ high round
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ @ low round. swap xmm7 and xmm6
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vmov.i8 q4, #0
+ vmov q5, q7 @ vmovdqa %xmm7, %xmm5
+ vmov q7, q6 @ vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ vmov q7, q5 @ vmovdqa %xmm5, %xmm7
+
+ b Loop_schedule_256
+
+@@
+@@ .aes_schedule_mangle_last
+@@
+@@ Mangler for last round of key schedule
+@@ Mangles q0
+@@ when encrypting, outputs out(q0) ^ 63
+@@ when decrypting, outputs unskew(q0)
+@@
+@@ Always called right before return... jumps to cleanup and exits
+@@
+.align 4
+Lschedule_mangle_last:
+ @ schedule last round key from xmm0
+ adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew
+ tst r3, r3
+ bne Lschedule_mangle_last_dec
+
+ @ encrypting
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
+ adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform
+ add r2, r2, #32 @ add $32, %rdx
+ vmov q2, q0
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
+ vtbl.8 d1, {q2}, d3
+
+Lschedule_mangle_last_dec:
+ sub r2, r2, #16 @ add $-16, %rdx
+ veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform @ output transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
+
+ @ cleanup
+ veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
+ veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
+ veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
+ veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
+ veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
+ veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
+ ldmia sp!, {r3,pc} @ return
+
+
+@@
+@@ .aes_schedule_192_smear
+@@
+@@ Smear the short, low side in the 192-bit key schedule.
+@@
+@@ Inputs:
+@@ q7: high side, b a x y
+@@ q6: low side, d c 0 0
+@@
+@@ Outputs:
+@@ q6: b+c+d b+c 0 0
+@@ q0: b+c+d b+c b a
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_schedule_192_smear
+#endif
+.align 4
+_vpaes_schedule_192_smear:
+ vmov.i8 q1, #0
+ vdup.32 q0, d15[1]
+ vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ vmov q0, q6 @ vmovdqa %xmm6, %xmm0
+ vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ bx lr
+
+
+@@
+@@ .aes_schedule_round
+@@
+@@ Runs one main round of the key schedule on q0, q7
+@@
+@@ Specifically, runs subbytes on the high dword of q0
+@@ then rotates it by one byte and xors into the low dword of
+@@ q7.
+@@
+@@ Adds rcon from low byte of q8, then rotates q8 for
+@@ next rcon.
+@@
+@@ Smears the dwords of q7 by xoring the low into the
+@@ second low, result into third, result into highest.
+@@
+@@ Returns results in q7 = q0.
+@@ Clobbers q1-q4, r11.
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_schedule_round
+#endif
+.align 4
+_vpaes_schedule_round:
+ @ extract rcon from xmm8
+ vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
+ vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
+ vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+
+ @ rotate
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ @ fall through...
+
+ @ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+ @ We pin other values in _vpaes_key_preheat, so load them now.
+ adr r11, Lk_sb1
+ vld1.64 {q14,q15}, [r11]
+
+ @ smear xmm7
+ vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+ vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
+
+ @ subbytes
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7
+ vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ vtbl.8 d7, {q10}, d7
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ vtbl.8 d5, {q10}, d9
+ veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q15}, d7
+ vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q14}, d5
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ @ add in smeared stuff
+ veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
+ veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
+ bx lr
+
+
+@@
+@@ .aes_schedule_transform
+@@
+@@ Linear-transform q0 according to tables at [r11]
+@@
+@@ Requires that q9 = 0x0F0F... as in preheat
+@@ Output in q0
+@@ Clobbers q1, q2, q14, q15
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_schedule_transform
+#endif
+.align 4
+_vpaes_schedule_transform:
+ vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
+ @ vmovdqa 16(%r11), %xmm1 # hi
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d3
+ vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q15}, d1
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+ bx lr
+
+
+@@
+@@ .aes_schedule_mangle
+@@
+@@ Mangles q0 from (basis-transformed) standard version
+@@ to our version.
+@@
+@@ On encrypt,
+@@ xor with 0x63
+@@ multiply by circulant 0,1,1,1
+@@ apply shiftrows transform
+@@
+@@ On decrypt,
+@@ xor with 0x63
+@@ multiply by "inverse mixcolumns" circulant E,B,D,9
+@@ deskew
+@@ apply shiftrows transform
+@@
+@@
+@@ Writes out to [r2], and increments or decrements it
+@@ Keeps track of round number mod 4 in r8
+@@ Preserves q0
+@@ Clobbers q1-q5
+@@
+#ifdef __thumb2__
+.thumb_func _vpaes_schedule_mangle
+#endif
+.align 4
+_vpaes_schedule_mangle:
+ tst r3, r3
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16.
+ vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5
+ bne Lschedule_mangle_dec
+
+ @ encrypting
+ @ Write to q2 so we do not overlap table and destination below.
+ veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4
+ add r2, r2, #16 @ add $16, %rdx
+ vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
+ vtbl.8 d9, {q2}, d11
+ vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
+ vtbl.8 d3, {q4}, d11
+ vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
+ vtbl.8 d7, {q1}, d11
+ veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
+
+ b Lschedule_mangle_both
+.align 4
+Lschedule_mangle_dec:
+ @ inverse mix columns
+ adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11
+ vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
+ @ vmovdqa 0x10(%r11), %xmm3
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dksb ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
+ @ vmovdqa 0x30(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dkse ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
+ @ vmovdqa 0x50(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
+ @ vmovdqa 0x70(%r11), %xmm4
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+ vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
+ vtbl.8 d9, {q15}, d3
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
+
+ sub r2, r2, #16 @ add $-16, %rdx
+
+Lschedule_mangle_both:
+ @ Write to q2 so table and destination do not overlap.
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d5, {q3}, d3
+ add r8, r8, #64-16 @ add $-16, %r8
+ and r8, r8, #~(1<<6) @ and $0x30, %r8
+ vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
+ bx lr
+
+
+.globl _vpaes_set_encrypt_key
+.private_extern _vpaes_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func _vpaes_set_encrypt_key
+#endif
+.align 4
+_vpaes_set_encrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov r3, #0 @ mov $0,%ecx
+ mov r8, #0x30 @ mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor r0, r0, r0
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+
+
+.globl _vpaes_set_decrypt_key
+.private_extern _vpaes_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func _vpaes_set_decrypt_key
+#endif
+.align 4
+_vpaes_set_decrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl r9, r9, #4 @ shl $4,%eax
+ add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
+ add r2, r2, r9
+
+ mov r3, #1 @ mov $1,%ecx
+ lsr r8, r1, #1 @ shr $1,%r8d
+ and r8, r8, #32 @ and $32,%r8d
+ eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+
+
+@ Additional constants for converting to bsaes.
+
+.align 4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@ def u64s_to_u128(x, y):
+@ return x | (y << 64)
+@ def u128_to_u64s(w):
+@ return w & ((1<<64)-1), w >> 64
+@ def get_byte(w, i):
+@ return (w >> (i*8)) & 0xff
+@ def apply_table(table, b):
+@ lo = b & 0xf
+@ hi = b >> 4
+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@ def opt(b):
+@ table = [
+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@ ]
+@ return apply_table(table, b)
+@ def rot_byte(b, n):
+@ return 0xff & ((b << n) | (b >> (8-n)))
+@ def skew(x):
+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@ rot_byte(x, 4))
+@ table = [0, 0]
+@ for i in range(16):
+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@ table[1] |= skew(opt(i<<4)) << (i*8)
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+Lk_opt_then_skew:
+.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad 0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
+Lk_decrypt_transform:
+.quad 0x0704050603000102, 0x0f0c0d0e0b08090a
+
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl _vpaes_encrypt_key_to_bsaes
+.private_extern _vpaes_encrypt_key_to_bsaes
+#ifdef __thumb2__
+.thumb_func _vpaes_encrypt_key_to_bsaes
+#endif
+.align 4
+_vpaes_encrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. In particular,
+ @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+ @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+ @ contain the transformations not in the bsaes representation. This
+ @ function inverts those transforms.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+ adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16.
+ add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+ vld1.64 {q12}, [r2]
+ vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64
+ adr r11, Lk_opt @ Must be aligned to 8 mod 16.
+ vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+ @ Invert this with .Lk_opt.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+ @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+ @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+Loop_enc_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+ @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+ @ We use r3 rather than r8 to avoid a callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq Loop_enc_key_to_bsaes_last
+
+ @ Multiply by the circulant. This is its own inverse.
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+ vmov q0, q1
+ vtbl.8 d4, {q1}, d24
+ vtbl.8 d5, {q1}, d25
+ veor q0, q0, q2
+ vtbl.8 d2, {q2}, d24
+ vtbl.8 d3, {q2}, d25
+ veor q0, q0, q1
+
+ @ XOR and finish.
+ veor q0, q0, q10
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+ b Loop_enc_key_to_bsaes
+
+Loop_enc_key_to_bsaes_last:
+ @ The final key does not have a basis transform (note
+ @ .Lschedule_mangle_last inverts the original transform). It only XORs
+ @ 0x63 and applies ShiftRows. The latter was already inverted in the
+ @ loop. Note that, because we act on the original representation, we use
+ @ q11, not q10.
+ veor q0, q0, q11
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
+.globl _vpaes_decrypt_key_to_bsaes
+.private_extern _vpaes_decrypt_key_to_bsaes
+#ifdef __thumb2__
+.thumb_func _vpaes_decrypt_key_to_bsaes
+#endif
+.align 4
+_vpaes_decrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+ @ computes the decryption key schedule in reverse. Additionally,
+ @ aes-x86_64.pl shares some transformations, so we must only partially
+ @ invert vpaes's transformations. In general, vpaes computes in a
+ @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+ @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+ @ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ adr r2, Lk_decrypt_transform
+ adr r3, Lk_sr+0x30
+ adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
+ vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ Undo the basis change and reapply the S-box affine transform. See
+ @ .Lschedule_mangle_last.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+ @ it simultaneously inverts MixColumns and the S-box affine transform.
+ @ See .Lk_dksd through .Lk_dks9.
+Loop_dec_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going
+ @ forwards cancels inverting for which direction we cycle r3. We use r3
+ @ rather than r8 to avoid a callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #64-16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq Loop_dec_key_to_bsaes_last
+
+ @ Undo the basis change and reapply the S-box affine transform.
+ bl _vpaes_schedule_transform
+
+ @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+ @ combine the two operations in .Lk_decrypt_transform.
+ @
+ @ TODO(davidben): Where does the rotation come from?
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+
+ vst1.64 {q1}, [r0]!
+ b Loop_dec_key_to_bsaes
+
+Loop_dec_key_to_bsaes_last:
+ @ The final key only inverts ShiftRows (already done in the loop). See
+ @ .Lschedule_am_decrypting. Its basis is not transformed.
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+
+.globl _vpaes_ctr32_encrypt_blocks
+.private_extern _vpaes_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func _vpaes_ctr32_encrypt_blocks
+#endif
+.align 4
+_vpaes_ctr32_encrypt_blocks:
+ mov ip, sp
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ @ This function uses q4-q7 (d8-d15), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ cmp r2, #0
+ @ r8 is passed on the stack.
+ ldr r8, [ip]
+ beq Lctr32_done
+
+ @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+ mov r9, r3
+ mov r3, r2
+ mov r2, r9
+
+ @ Load the IV and counter portion.
+ ldr r7, [r8, #12]
+ vld1.8 {q7}, [r8]
+
+ bl _vpaes_preheat
+ rev r7, r7 @ The counter is big-endian.
+
+Lctr32_loop:
+ vmov q0, q7
+ vld1.8 {q6}, [r0]! @ Load input ahead of time
+ bl _vpaes_encrypt_core
+ veor q0, q0, q6 @ XOR input and result
+ vst1.8 {q0}, [r1]!
+ subs r3, r3, #1
+ @ Update the counter.
+ add r7, r7, #1
+ rev r9, r7
+ vmov.32 d15[1], r9
+ bne Lctr32_loop
+
+Lctr32_done:
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+
+#endif // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/test/trampoline-armv4.S b/apple-arm/crypto/test/trampoline-armv4.S
new file mode 100644
index 0000000..9d74f55
--- /dev/null
+++ b/apple-arm/crypto/test/trampoline-armv4.S
@@ -0,0 +1,376 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax unified
+
+
+
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@ const uint32_t *argv, size_t argc,
+@ int unwind);
+
+.globl _abi_test_trampoline
+.private_extern _abi_test_trampoline
+.align 4
+_abi_test_trampoline:
+ @ Save parameters and all callee-saved registers. For convenience, we
+ @ save r9 on iOS even though it's volatile.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+ @ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+ @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+ sub sp, sp, #28
+
+ @ Every register in AAPCS is either non-volatile or a parameter (except
+ @ r9 on iOS), so this code, by the actual call, loses all its scratch
+ @ registers. First fill in stack parameters while there are registers
+ @ to spare.
+ cmp r3, #4
+ bls Lstack_args_done
+ mov r4, sp @ r4 is the output pointer.
+ add r5, r2, r3, lsl #2 @ Set r5 to the end of argv.
+ add r2, r2, #16 @ Skip four arguments.
+Lstack_args_loop:
+ ldr r6, [r2], #4
+ cmp r2, r5
+ str r6, [r4], #4
+ bne Lstack_args_loop
+
+Lstack_args_done:
+ @ Load registers from |r1|.
+ vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ ldmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Load register parameters. This uses up our remaining registers, so we
+ @ repurpose lr as scratch space.
+ ldr r3, [sp, #40] @ Reload argc.
+ ldr lr, [sp, #36] @ Load argv into lr.
+ cmp r3, #3
+ bhi Larg_r3
+ beq Larg_r2
+ cmp r3, #1
+ bhi Larg_r1
+ beq Larg_r0
+ b Largs_done
+
+Larg_r3:
+ ldr r3, [lr, #12] @ argv[3]
+Larg_r2:
+ ldr r2, [lr, #8] @ argv[2]
+Larg_r1:
+ ldr r1, [lr, #4] @ argv[1]
+Larg_r0:
+ ldr r0, [lr] @ argv[0]
+Largs_done:
+
+ @ With every other register in use, load the function pointer into lr
+ @ and call the function.
+ ldr lr, [sp, #28]
+ blx lr
+
+ @ r1-r3 are free for use again. The trampoline only supports
+ @ single-return functions. Pass r4-r11 to the caller.
+ ldr r1, [sp, #32]
+ vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ stmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Unwind the stack and restore registers.
+ add sp, sp, #44 @ 44 = 28+16
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above).
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ bx lr
+
+
+.globl _abi_test_clobber_r0
+.private_extern _abi_test_clobber_r0
+.align 4
+_abi_test_clobber_r0:
+ mov r0, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r1
+.private_extern _abi_test_clobber_r1
+.align 4
+_abi_test_clobber_r1:
+ mov r1, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r2
+.private_extern _abi_test_clobber_r2
+.align 4
+_abi_test_clobber_r2:
+ mov r2, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r3
+.private_extern _abi_test_clobber_r3
+.align 4
+_abi_test_clobber_r3:
+ mov r3, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r4
+.private_extern _abi_test_clobber_r4
+.align 4
+_abi_test_clobber_r4:
+ mov r4, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r5
+.private_extern _abi_test_clobber_r5
+.align 4
+_abi_test_clobber_r5:
+ mov r5, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r6
+.private_extern _abi_test_clobber_r6
+.align 4
+_abi_test_clobber_r6:
+ mov r6, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r7
+.private_extern _abi_test_clobber_r7
+.align 4
+_abi_test_clobber_r7:
+ mov r7, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r8
+.private_extern _abi_test_clobber_r8
+.align 4
+_abi_test_clobber_r8:
+ mov r8, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r9
+.private_extern _abi_test_clobber_r9
+.align 4
+_abi_test_clobber_r9:
+ mov r9, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r10
+.private_extern _abi_test_clobber_r10
+.align 4
+_abi_test_clobber_r10:
+ mov r10, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r11
+.private_extern _abi_test_clobber_r11
+.align 4
+_abi_test_clobber_r11:
+ mov r11, #0
+ bx lr
+
+
+.globl _abi_test_clobber_r12
+.private_extern _abi_test_clobber_r12
+.align 4
+_abi_test_clobber_r12:
+ mov r12, #0
+ bx lr
+
+
+.globl _abi_test_clobber_d0
+.private_extern _abi_test_clobber_d0
+.align 4
+_abi_test_clobber_d0:
+ mov r0, #0
+ vmov s0, r0
+ vmov s1, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d1
+.private_extern _abi_test_clobber_d1
+.align 4
+_abi_test_clobber_d1:
+ mov r0, #0
+ vmov s2, r0
+ vmov s3, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d2
+.private_extern _abi_test_clobber_d2
+.align 4
+_abi_test_clobber_d2:
+ mov r0, #0
+ vmov s4, r0
+ vmov s5, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d3
+.private_extern _abi_test_clobber_d3
+.align 4
+_abi_test_clobber_d3:
+ mov r0, #0
+ vmov s6, r0
+ vmov s7, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d4
+.private_extern _abi_test_clobber_d4
+.align 4
+_abi_test_clobber_d4:
+ mov r0, #0
+ vmov s8, r0
+ vmov s9, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d5
+.private_extern _abi_test_clobber_d5
+.align 4
+_abi_test_clobber_d5:
+ mov r0, #0
+ vmov s10, r0
+ vmov s11, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d6
+.private_extern _abi_test_clobber_d6
+.align 4
+_abi_test_clobber_d6:
+ mov r0, #0
+ vmov s12, r0
+ vmov s13, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d7
+.private_extern _abi_test_clobber_d7
+.align 4
+_abi_test_clobber_d7:
+ mov r0, #0
+ vmov s14, r0
+ vmov s15, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d8
+.private_extern _abi_test_clobber_d8
+.align 4
+_abi_test_clobber_d8:
+ mov r0, #0
+ vmov s16, r0
+ vmov s17, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d9
+.private_extern _abi_test_clobber_d9
+.align 4
+_abi_test_clobber_d9:
+ mov r0, #0
+ vmov s18, r0
+ vmov s19, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d10
+.private_extern _abi_test_clobber_d10
+.align 4
+_abi_test_clobber_d10:
+ mov r0, #0
+ vmov s20, r0
+ vmov s21, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d11
+.private_extern _abi_test_clobber_d11
+.align 4
+_abi_test_clobber_d11:
+ mov r0, #0
+ vmov s22, r0
+ vmov s23, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d12
+.private_extern _abi_test_clobber_d12
+.align 4
+_abi_test_clobber_d12:
+ mov r0, #0
+ vmov s24, r0
+ vmov s25, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d13
+.private_extern _abi_test_clobber_d13
+.align 4
+_abi_test_clobber_d13:
+ mov r0, #0
+ vmov s26, r0
+ vmov s27, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d14
+.private_extern _abi_test_clobber_d14
+.align 4
+_abi_test_clobber_d14:
+ mov r0, #0
+ vmov s28, r0
+ vmov s29, r0
+ bx lr
+
+
+.globl _abi_test_clobber_d15
+.private_extern _abi_test_clobber_d15
+.align 4
+_abi_test_clobber_d15:
+ mov r0, #0
+ vmov s30, r0
+ vmov s31, r0
+ bx lr
+
+#endif // !OPENSSL_NO_ASM
diff --git a/mac-x86/crypto/chacha/chacha-x86.S b/apple-x86/crypto/chacha/chacha-x86.S
similarity index 100%
rename from mac-x86/crypto/chacha/chacha-x86.S
rename to apple-x86/crypto/chacha/chacha-x86.S
diff --git a/mac-x86/crypto/fipsmodule/aesni-x86.S b/apple-x86/crypto/fipsmodule/aesni-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/aesni-x86.S
rename to apple-x86/crypto/fipsmodule/aesni-x86.S
diff --git a/mac-x86/crypto/fipsmodule/bn-586.S b/apple-x86/crypto/fipsmodule/bn-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/bn-586.S
rename to apple-x86/crypto/fipsmodule/bn-586.S
diff --git a/mac-x86/crypto/fipsmodule/co-586.S b/apple-x86/crypto/fipsmodule/co-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/co-586.S
rename to apple-x86/crypto/fipsmodule/co-586.S
diff --git a/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S
rename to apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
diff --git a/mac-x86/crypto/fipsmodule/ghash-x86.S b/apple-x86/crypto/fipsmodule/ghash-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/ghash-x86.S
rename to apple-x86/crypto/fipsmodule/ghash-x86.S
diff --git a/mac-x86/crypto/fipsmodule/md5-586.S b/apple-x86/crypto/fipsmodule/md5-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/md5-586.S
rename to apple-x86/crypto/fipsmodule/md5-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha1-586.S b/apple-x86/crypto/fipsmodule/sha1-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha1-586.S
rename to apple-x86/crypto/fipsmodule/sha1-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha256-586.S b/apple-x86/crypto/fipsmodule/sha256-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha256-586.S
rename to apple-x86/crypto/fipsmodule/sha256-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha512-586.S b/apple-x86/crypto/fipsmodule/sha512-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha512-586.S
rename to apple-x86/crypto/fipsmodule/sha512-586.S
diff --git a/mac-x86/crypto/fipsmodule/vpaes-x86.S b/apple-x86/crypto/fipsmodule/vpaes-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/vpaes-x86.S
rename to apple-x86/crypto/fipsmodule/vpaes-x86.S
diff --git a/mac-x86/crypto/fipsmodule/x86-mont.S b/apple-x86/crypto/fipsmodule/x86-mont.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/x86-mont.S
rename to apple-x86/crypto/fipsmodule/x86-mont.S
diff --git a/mac-x86/crypto/test/trampoline-x86.S b/apple-x86/crypto/test/trampoline-x86.S
similarity index 100%
rename from mac-x86/crypto/test/trampoline-x86.S
rename to apple-x86/crypto/test/trampoline-x86.S
diff --git a/mac-x86_64/crypto/chacha/chacha-x86_64.S b/apple-x86_64/crypto/chacha/chacha-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/chacha/chacha-x86_64.S
rename to apple-x86_64/crypto/chacha/chacha-x86_64.S
diff --git a/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
rename to apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
diff --git a/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
rename to apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S b/apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/aesni-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S b/apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/ghash-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/md5-x86_64.S b/apple-x86_64/crypto/fipsmodule/md5-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/md5-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/md5-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
rename to apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
diff --git a/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
rename to apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
diff --git a/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/rsaz-avx2.S
rename to apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
diff --git a/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/sha1-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
similarity index 93%
rename from mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
index d94268d..00dc01c 100644
--- a/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
+++ b/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -24,6 +24,8 @@
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz L$shaext_shortcut
andl $1073741824,%r9d
andl $268435968,%r10d
orl %r9d,%r10d
@@ -1782,6 +1784,215 @@
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.p2align 6
+sha256_block_data_order_shaext:
+
+L$shaext_shortcut:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp L$oop_shaext
+
+.p2align 4
+L$oop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz L$oop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ .byte 0xf3,0xc3
+
+
+
+.p2align 6
sha256_block_data_order_ssse3:
L$ssse3_shortcut:
diff --git a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/x86_64-mont.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/x86_64-mont.S
rename to apple-x86_64/crypto/fipsmodule/x86_64-mont.S
diff --git a/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/x86_64-mont5.S
rename to apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
diff --git a/mac-x86_64/crypto/test/trampoline-x86_64.S b/apple-x86_64/crypto/test/trampoline-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/test/trampoline-x86_64.S
rename to apple-x86_64/crypto/test/trampoline-x86_64.S
diff --git a/err_data.c b/err_data.c
index 8432081..de52cc0 100644
--- a/err_data.c
+++ b/err_data.c
@@ -193,50 +193,50 @@
0x283480b9,
0x283500f7,
0x28358c94,
- 0x2c323286,
+ 0x2c323284,
0x2c32932e,
- 0x2c333294,
- 0x2c33b2a6,
- 0x2c3432ba,
- 0x2c34b2cc,
- 0x2c3532e7,
- 0x2c35b2f9,
- 0x2c363329,
+ 0x2c333292,
+ 0x2c33b2a4,
+ 0x2c3432b8,
+ 0x2c34b2ca,
+ 0x2c3532e5,
+ 0x2c35b2f7,
+ 0x2c363327,
0x2c36833a,
- 0x2c373336,
- 0x2c37b362,
- 0x2c383387,
- 0x2c38b39e,
- 0x2c3933bc,
- 0x2c39b3cc,
- 0x2c3a33de,
- 0x2c3ab3f2,
- 0x2c3b3403,
- 0x2c3bb422,
+ 0x2c373334,
+ 0x2c37b360,
+ 0x2c383385,
+ 0x2c38b39c,
+ 0x2c3933ba,
+ 0x2c39b3ca,
+ 0x2c3a33dc,
+ 0x2c3ab3f0,
+ 0x2c3b3401,
+ 0x2c3bb420,
0x2c3c1340,
0x2c3c9356,
- 0x2c3d3467,
+ 0x2c3d3465,
0x2c3d936f,
- 0x2c3e3491,
- 0x2c3eb49f,
- 0x2c3f34b7,
- 0x2c3fb4cf,
- 0x2c4034f9,
+ 0x2c3e348f,
+ 0x2c3eb49d,
+ 0x2c3f34b5,
+ 0x2c3fb4cd,
+ 0x2c4034f7,
0x2c409241,
- 0x2c41350a,
- 0x2c41b51d,
+ 0x2c413508,
+ 0x2c41b51b,
0x2c421207,
- 0x2c42b52e,
+ 0x2c42b52c,
0x2c43074a,
- 0x2c43b414,
- 0x2c443375,
- 0x2c44b4dc,
- 0x2c45330c,
- 0x2c45b348,
- 0x2c4633ac,
- 0x2c46b436,
- 0x2c47344b,
- 0x2c47b484,
+ 0x2c43b412,
+ 0x2c443373,
+ 0x2c44b4da,
+ 0x2c45330a,
+ 0x2c45b346,
+ 0x2c4633aa,
+ 0x2c46b434,
+ 0x2c473449,
+ 0x2c47b482,
0x30320000,
0x30328015,
0x3033001f,
@@ -433,158 +433,158 @@
0x404ea057,
0x404f20f1,
0x404fa167,
- 0x405021be,
- 0x4050a1d2,
- 0x40512205,
- 0x40522215,
- 0x4052a239,
- 0x40532251,
- 0x4053a264,
- 0x40542279,
- 0x4054a29c,
- 0x405522c7,
- 0x4055a304,
- 0x40562329,
- 0x4056a342,
- 0x4057235a,
- 0x4057a36d,
- 0x40582382,
- 0x4058a3a9,
- 0x405923d8,
- 0x4059a405,
- 0x405a2419,
- 0x405aa429,
- 0x405b2441,
- 0x405ba452,
- 0x405c2465,
- 0x405ca4a4,
- 0x405d24b1,
- 0x405da4d6,
- 0x405e2514,
+ 0x405021d6,
+ 0x4050a1ea,
+ 0x4051221d,
+ 0x4052222d,
+ 0x4052a251,
+ 0x40532269,
+ 0x4053a27c,
+ 0x40542291,
+ 0x4054a2b4,
+ 0x405522df,
+ 0x4055a31c,
+ 0x40562341,
+ 0x4056a35a,
+ 0x40572372,
+ 0x4057a385,
+ 0x4058239a,
+ 0x4058a3c1,
+ 0x405923f0,
+ 0x4059a41d,
+ 0x405a2431,
+ 0x405aa441,
+ 0x405b2459,
+ 0x405ba46a,
+ 0x405c247d,
+ 0x405ca4bc,
+ 0x405d24c9,
+ 0x405da4ee,
+ 0x405e252c,
0x405e8adb,
- 0x405f254f,
- 0x405fa55c,
- 0x4060256a,
- 0x4060a58c,
- 0x406125ed,
- 0x4061a625,
- 0x4062263c,
- 0x4062a64d,
- 0x4063269a,
- 0x4063a6af,
- 0x406426c6,
- 0x4064a6f2,
- 0x4065270d,
- 0x4065a724,
- 0x4066273c,
- 0x4066a766,
- 0x40672791,
- 0x4067a7d6,
- 0x4068281e,
- 0x4068a83f,
- 0x40692871,
- 0x4069a89f,
- 0x406a28c0,
- 0x406aa8e0,
- 0x406b2a68,
- 0x406baa8b,
- 0x406c2aa1,
- 0x406cadab,
- 0x406d2dda,
- 0x406dae02,
- 0x406e2e30,
- 0x406eae7d,
- 0x406f2ed6,
- 0x406faf0e,
- 0x40702f21,
- 0x4070af3e,
+ 0x405f254d,
+ 0x405fa55a,
+ 0x40602568,
+ 0x4060a58a,
+ 0x406125eb,
+ 0x4061a623,
+ 0x4062263a,
+ 0x4062a64b,
+ 0x40632698,
+ 0x4063a6ad,
+ 0x406426c4,
+ 0x4064a6f0,
+ 0x4065270b,
+ 0x4065a722,
+ 0x4066273a,
+ 0x4066a764,
+ 0x4067278f,
+ 0x4067a7d4,
+ 0x4068281c,
+ 0x4068a83d,
+ 0x4069286f,
+ 0x4069a89d,
+ 0x406a28be,
+ 0x406aa8de,
+ 0x406b2a66,
+ 0x406baa89,
+ 0x406c2a9f,
+ 0x406cada9,
+ 0x406d2dd8,
+ 0x406dae00,
+ 0x406e2e2e,
+ 0x406eae7b,
+ 0x406f2ed4,
+ 0x406faf0c,
+ 0x40702f1f,
+ 0x4070af3c,
0x4071082a,
- 0x4071af50,
- 0x40722f63,
- 0x4072af99,
- 0x40732fb1,
+ 0x4071af4e,
+ 0x40722f61,
+ 0x4072af97,
+ 0x40732faf,
0x40739540,
- 0x40742fc5,
- 0x4074afdf,
- 0x40752ff0,
- 0x4075b004,
- 0x40763012,
+ 0x40742fc3,
+ 0x4074afdd,
+ 0x40752fee,
+ 0x4075b002,
+ 0x40763010,
0x40769304,
- 0x40773037,
- 0x4077b077,
- 0x40783092,
- 0x4078b0cb,
- 0x407930e2,
- 0x4079b0f8,
- 0x407a3124,
- 0x407ab137,
- 0x407b314c,
- 0x407bb15e,
- 0x407c318f,
- 0x407cb198,
- 0x407d285a,
- 0x407da177,
- 0x407e30a7,
- 0x407ea3b9,
+ 0x40773035,
+ 0x4077b075,
+ 0x40783090,
+ 0x4078b0c9,
+ 0x407930e0,
+ 0x4079b0f6,
+ 0x407a3122,
+ 0x407ab135,
+ 0x407b314a,
+ 0x407bb15c,
+ 0x407c318d,
+ 0x407cb196,
+ 0x407d2858,
+ 0x407da18f,
+ 0x407e30a5,
+ 0x407ea3d1,
0x407f1dcb,
0x407f9f9e,
0x40802101,
0x40809df3,
- 0x40812227,
+ 0x4081223f,
0x4081a0a5,
- 0x40822e1b,
+ 0x40822e19,
0x40829b46,
- 0x40832394,
- 0x4083a6d7,
+ 0x408323ac,
+ 0x4083a6d5,
0x40841e07,
- 0x4084a3f1,
- 0x40852476,
- 0x4085a5b4,
- 0x408624f6,
- 0x4086a191,
- 0x40872e61,
- 0x4087a602,
+ 0x4084a409,
+ 0x4085248e,
+ 0x4085a5b2,
+ 0x4086250e,
+ 0x4086a1a9,
+ 0x40872e5f,
+ 0x4087a600,
0x40881b84,
- 0x4088a7e9,
+ 0x4088a7e7,
0x40891bd3,
0x40899b60,
- 0x408a2ad9,
+ 0x408a2ad7,
0x408a9958,
- 0x408b3173,
- 0x408baeeb,
- 0x408c2486,
+ 0x408b3171,
+ 0x408baee9,
+ 0x408c249e,
0x408c9990,
0x408d1eef,
0x408d9e39,
0x408e201f,
- 0x408ea2e4,
- 0x408f27fd,
- 0x408fa5d0,
- 0x409027b2,
- 0x4090a4c8,
- 0x40912ac1,
+ 0x408ea2fc,
+ 0x408f27fb,
+ 0x408fa5ce,
+ 0x409027b0,
+ 0x4090a4e0,
+ 0x40912abf,
0x409199b6,
0x40921c20,
- 0x4092ae9c,
- 0x40932f7c,
- 0x4093a1a2,
+ 0x4092ae9a,
+ 0x40932f7a,
+ 0x4093a1ba,
0x40941e1b,
- 0x4094aaf2,
- 0x4095265e,
- 0x4095b104,
- 0x40962e48,
+ 0x4094aaf0,
+ 0x4095265c,
+ 0x4095b102,
+ 0x40962e46,
0x4096a11a,
- 0x409721ed,
+ 0x40972205,
0x4097a06e,
0x40981c80,
- 0x4098a672,
- 0x40992eb8,
- 0x4099a311,
- 0x409a22aa,
+ 0x4098a670,
+ 0x40992eb6,
+ 0x4099a329,
+ 0x409a22c2,
0x409a9974,
0x409b1e75,
0x409b9ea0,
- 0x409c3059,
+ 0x409c3057,
0x409c9ec8,
0x409d20d6,
0x409da0bb,
@@ -592,42 +592,42 @@
0x409ea14f,
0x409f2137,
0x409f9e68,
- 0x40a02535,
+ 0x40a02177,
0x40a0a088,
- 0x41f42993,
- 0x41f92a25,
- 0x41fe2918,
- 0x41feabce,
- 0x41ff2cfc,
- 0x420329ac,
- 0x420829ce,
- 0x4208aa0a,
- 0x420928fc,
- 0x4209aa44,
- 0x420a2953,
- 0x420aa933,
- 0x420b2973,
- 0x420ba9ec,
- 0x420c2d18,
- 0x420cab02,
- 0x420d2bb5,
- 0x420dabec,
- 0x42122c1f,
- 0x42172cdf,
- 0x4217ac61,
- 0x421c2c83,
- 0x421f2c3e,
- 0x42212d90,
- 0x42262cc2,
- 0x422b2d6e,
- 0x422bab90,
- 0x422c2d50,
- 0x422cab43,
- 0x422d2b1c,
- 0x422dad2f,
- 0x422e2b6f,
- 0x42302c9e,
- 0x4230ac06,
+ 0x41f42991,
+ 0x41f92a23,
+ 0x41fe2916,
+ 0x41feabcc,
+ 0x41ff2cfa,
+ 0x420329aa,
+ 0x420829cc,
+ 0x4208aa08,
+ 0x420928fa,
+ 0x4209aa42,
+ 0x420a2951,
+ 0x420aa931,
+ 0x420b2971,
+ 0x420ba9ea,
+ 0x420c2d16,
+ 0x420cab00,
+ 0x420d2bb3,
+ 0x420dabea,
+ 0x42122c1d,
+ 0x42172cdd,
+ 0x4217ac5f,
+ 0x421c2c81,
+ 0x421f2c3c,
+ 0x42212d8e,
+ 0x42262cc0,
+ 0x422b2d6c,
+ 0x422bab8e,
+ 0x422c2d4e,
+ 0x422cab41,
+ 0x422d2b1a,
+ 0x422dad2d,
+ 0x422e2b6d,
+ 0x42302c9c,
+ 0x4230ac04,
0x44320755,
0x44328764,
0x44330770,
@@ -682,71 +682,71 @@
0x4c41159d,
0x4c419420,
0x4c421589,
- 0x50323540,
- 0x5032b54f,
- 0x5033355a,
- 0x5033b56a,
- 0x50343583,
- 0x5034b59d,
- 0x503535ab,
- 0x5035b5c1,
- 0x503635d3,
- 0x5036b5e9,
- 0x50373602,
- 0x5037b615,
- 0x5038362d,
- 0x5038b63e,
- 0x50393653,
- 0x5039b667,
- 0x503a3687,
- 0x503ab69d,
- 0x503b36b5,
- 0x503bb6c7,
- 0x503c36e3,
- 0x503cb6fa,
- 0x503d3713,
- 0x503db729,
- 0x503e3736,
- 0x503eb74c,
- 0x503f375e,
+ 0x5032353e,
+ 0x5032b54d,
+ 0x50333558,
+ 0x5033b568,
+ 0x50343581,
+ 0x5034b59b,
+ 0x503535a9,
+ 0x5035b5bf,
+ 0x503635d1,
+ 0x5036b5e7,
+ 0x50373600,
+ 0x5037b613,
+ 0x5038362b,
+ 0x5038b63c,
+ 0x50393651,
+ 0x5039b665,
+ 0x503a3685,
+ 0x503ab69b,
+ 0x503b36b3,
+ 0x503bb6c5,
+ 0x503c36e1,
+ 0x503cb6f8,
+ 0x503d3711,
+ 0x503db727,
+ 0x503e3734,
+ 0x503eb74a,
+ 0x503f375c,
0x503f83a3,
- 0x50403771,
- 0x5040b781,
- 0x5041379b,
- 0x5041b7aa,
- 0x504237c4,
- 0x5042b7e1,
- 0x504337f1,
- 0x5043b801,
- 0x5044381e,
+ 0x5040376f,
+ 0x5040b77f,
+ 0x50413799,
+ 0x5041b7a8,
+ 0x504237c2,
+ 0x5042b7df,
+ 0x504337ef,
+ 0x5043b7ff,
+ 0x5044381c,
0x50448459,
- 0x50453832,
- 0x5045b850,
- 0x50463863,
- 0x5046b879,
- 0x5047388b,
- 0x5047b8a0,
- 0x504838c6,
- 0x5048b8d4,
- 0x504938e7,
- 0x5049b8fc,
- 0x504a3912,
- 0x504ab922,
- 0x504b3942,
- 0x504bb955,
- 0x504c3978,
- 0x504cb9a6,
- 0x504d39d3,
- 0x504db9f0,
- 0x504e3a0b,
- 0x504eba27,
- 0x504f3a39,
- 0x504fba50,
- 0x50503a5f,
+ 0x50453830,
+ 0x5045b84e,
+ 0x50463861,
+ 0x5046b877,
+ 0x50473889,
+ 0x5047b89e,
+ 0x504838c4,
+ 0x5048b8d2,
+ 0x504938e5,
+ 0x5049b8fa,
+ 0x504a3910,
+ 0x504ab920,
+ 0x504b3940,
+ 0x504bb953,
+ 0x504c3976,
+ 0x504cb9a4,
+ 0x504d39d1,
+ 0x504db9ee,
+ 0x504e3a09,
+ 0x504eba25,
+ 0x504f3a37,
+ 0x504fba4e,
+ 0x50503a5d,
0x50508719,
- 0x50513a72,
- 0x5051b810,
- 0x505239b8,
+ 0x50513a70,
+ 0x5051b80e,
+ 0x505239b6,
0x58320f8d,
0x68320f4f,
0x68328ca7,
@@ -790,19 +790,19 @@
0x7c32121d,
0x80321433,
0x80328090,
- 0x80333255,
+ 0x80333253,
0x803380b9,
- 0x80343264,
- 0x8034b1cc,
- 0x803531ea,
- 0x8035b278,
- 0x8036322c,
- 0x8036b1db,
- 0x8037321e,
- 0x8037b1b9,
- 0x8038323f,
- 0x8038b1fb,
- 0x80393210,
+ 0x80343262,
+ 0x8034b1ca,
+ 0x803531e8,
+ 0x8035b276,
+ 0x8036322a,
+ 0x8036b1d9,
+ 0x8037321c,
+ 0x8037b1b7,
+ 0x8038323d,
+ 0x8038b1f9,
+ 0x8039320e,
};
const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]);
@@ -1226,6 +1226,7 @@
"INVALID_ECH_CONFIG_LIST\0"
"INVALID_ECH_PUBLIC_NAME\0"
"INVALID_MESSAGE\0"
+ "INVALID_OUTER_EXTENSION\0"
"INVALID_OUTER_RECORD_TYPE\0"
"INVALID_SCT_LIST\0"
"INVALID_SIGNATURE_ALGORITHM\0"
@@ -1269,7 +1270,6 @@
"OLD_SESSION_CIPHER_NOT_RETURNED\0"
"OLD_SESSION_PRF_HASH_MISMATCH\0"
"OLD_SESSION_VERSION_NOT_RETURNED\0"
- "OUTER_EXTENSION_NOT_FOUND\0"
"PARSE_TLSEXT\0"
"PATH_TOO_LONG\0"
"PEER_DID_NOT_RETURN_A_CERTIFICATE\0"
diff --git a/eureka.mk b/eureka.mk
index ec431f4..93d4437 100644
--- a/eureka.mk
+++ b/eureka.mk
@@ -73,6 +73,7 @@
src/crypto/cipher_extra/e_aesctrhmac.c\
src/crypto/cipher_extra/e_aesgcmsiv.c\
src/crypto/cipher_extra/e_chacha20poly1305.c\
+ src/crypto/cipher_extra/e_des.c\
src/crypto/cipher_extra/e_null.c\
src/crypto/cipher_extra/e_rc2.c\
src/crypto/cipher_extra/e_rc4.c\
@@ -91,6 +92,7 @@
src/crypto/crypto.c\
src/crypto/curve25519/curve25519.c\
src/crypto/curve25519/spake25519.c\
+ src/crypto/des/des.c\
src/crypto/dh_extra/dh_asn1.c\
src/crypto/dh_extra/params.c\
src/crypto/digest_extra/digest_extra.c\
diff --git a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
index a09764a..6ce216f 100644
--- a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -25,6 +25,8 @@
movl 0(%r11),%r9d
movl 4(%r11),%r10d
movl 8(%r11),%r11d
+ testl $536870912,%r11d
+ jnz .Lshaext_shortcut
andl $1073741824,%r9d
andl $268435968,%r10d
orl %r9d,%r10d
@@ -1781,6 +1783,215 @@
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha256_block_data_order_shaext,@function
+.align 64
+sha256_block_data_order_shaext:
+.cfi_startproc
+.Lshaext_shortcut:
+ leaq K256+128(%rip),%rcx
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa 512-128(%rcx),%xmm7
+
+ pshufd $0x1b,%xmm1,%xmm0
+ pshufd $0xb1,%xmm1,%xmm1
+ pshufd $0x1b,%xmm2,%xmm2
+ movdqa %xmm7,%xmm8
+.byte 102,15,58,15,202,8
+ punpcklqdq %xmm0,%xmm2
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ movdqu (%rsi),%xmm3
+ movdqu 16(%rsi),%xmm4
+ movdqu 32(%rsi),%xmm5
+.byte 102,15,56,0,223
+ movdqu 48(%rsi),%xmm6
+
+ movdqa 0-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 102,15,56,0,231
+ movdqa %xmm2,%xmm10
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ nop
+ movdqa %xmm1,%xmm9
+.byte 15,56,203,202
+
+ movdqa 32-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 102,15,56,0,239
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ leaq 64(%rsi),%rsi
+.byte 15,56,204,220
+.byte 15,56,203,202
+
+ movdqa 64-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 102,15,56,0,247
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+
+ movdqa 96-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 128-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 160-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 192-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 224-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 256-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 288-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+ nop
+ paddd %xmm7,%xmm6
+.byte 15,56,204,220
+.byte 15,56,203,202
+ movdqa 320-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,205,245
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm6,%xmm7
+.byte 102,15,58,15,253,4
+ nop
+ paddd %xmm7,%xmm3
+.byte 15,56,204,229
+.byte 15,56,203,202
+ movdqa 352-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+.byte 15,56,205,222
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm3,%xmm7
+.byte 102,15,58,15,254,4
+ nop
+ paddd %xmm7,%xmm4
+.byte 15,56,204,238
+.byte 15,56,203,202
+ movdqa 384-128(%rcx),%xmm0
+ paddd %xmm3,%xmm0
+.byte 15,56,205,227
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm4,%xmm7
+.byte 102,15,58,15,251,4
+ nop
+ paddd %xmm7,%xmm5
+.byte 15,56,204,243
+.byte 15,56,203,202
+ movdqa 416-128(%rcx),%xmm0
+ paddd %xmm4,%xmm0
+.byte 15,56,205,236
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ movdqa %xmm5,%xmm7
+.byte 102,15,58,15,252,4
+.byte 15,56,203,202
+ paddd %xmm7,%xmm6
+
+ movdqa 448-128(%rcx),%xmm0
+ paddd %xmm5,%xmm0
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+.byte 15,56,205,245
+ movdqa %xmm8,%xmm7
+.byte 15,56,203,202
+
+ movdqa 480-128(%rcx),%xmm0
+ paddd %xmm6,%xmm0
+ nop
+.byte 15,56,203,209
+ pshufd $0x0e,%xmm0,%xmm0
+ decq %rdx
+ nop
+.byte 15,56,203,202
+
+ paddd %xmm10,%xmm2
+ paddd %xmm9,%xmm1
+ jnz .Loop_shaext
+
+ pshufd $0xb1,%xmm2,%xmm2
+ pshufd $0x1b,%xmm1,%xmm7
+ pshufd $0xb1,%xmm1,%xmm1
+ punpckhqdq %xmm2,%xmm1
+.byte 102,15,58,15,215,8
+
+ movdqu %xmm1,(%rdi)
+ movdqu %xmm2,16(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
.type sha256_block_data_order_ssse3,@function
.align 64
sha256_block_data_order_ssse3:
diff --git a/sources.bp b/sources.bp
index 704f514..2f3e684 100644
--- a/sources.bp
+++ b/sources.bp
@@ -75,6 +75,7 @@
"src/crypto/cipher_extra/e_aesctrhmac.c",
"src/crypto/cipher_extra/e_aesgcmsiv.c",
"src/crypto/cipher_extra/e_chacha20poly1305.c",
+ "src/crypto/cipher_extra/e_des.c",
"src/crypto/cipher_extra/e_null.c",
"src/crypto/cipher_extra/e_rc2.c",
"src/crypto/cipher_extra/e_rc4.c",
@@ -93,6 +94,7 @@
"src/crypto/crypto.c",
"src/crypto/curve25519/curve25519.c",
"src/crypto/curve25519/spake25519.c",
+ "src/crypto/des/des.c",
"src/crypto/dh_extra/dh_asn1.c",
"src/crypto/dh_extra/params.c",
"src/crypto/digest_extra/digest_extra.c",
diff --git a/sources.mk b/sources.mk
index ebc49c7..397432e 100644
--- a/sources.mk
+++ b/sources.mk
@@ -73,6 +73,7 @@
src/crypto/cipher_extra/e_aesctrhmac.c\
src/crypto/cipher_extra/e_aesgcmsiv.c\
src/crypto/cipher_extra/e_chacha20poly1305.c\
+ src/crypto/cipher_extra/e_des.c\
src/crypto/cipher_extra/e_null.c\
src/crypto/cipher_extra/e_rc2.c\
src/crypto/cipher_extra/e_rc4.c\
@@ -91,6 +92,7 @@
src/crypto/crypto.c\
src/crypto/curve25519/curve25519.c\
src/crypto/curve25519/spake25519.c\
+ src/crypto/des/des.c\
src/crypto/dh_extra/dh_asn1.c\
src/crypto/dh_extra/params.c\
src/crypto/digest_extra/digest_extra.c\
diff --git a/src/.gitignore b/src/.gitignore
index a8e3184..6cbc9d2 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -24,7 +24,7 @@
util/bot/perl-win32
util/bot/perl-win32.zip
util/bot/sde-linux64
-util/bot/sde-linux64.tar.bz2
+util/bot/sde-linux64.tar.xz
util/bot/sde-win32
-util/bot/sde-win32.tar.bz2
+util/bot/sde-win32.tar.xz
util/bot/win_toolchain.json
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f74e233..35ff4c1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -257,10 +257,11 @@
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
endif()
-# pthread_rwlock_t on Linux requires a feature flag. However, it should not be
-# set on Apple platforms, where it instead disables APIs we use. See compat(5)
-# and sys/cdefs.h.
-if(NOT WIN32 AND NOT APPLE)
+# pthread_rwlock_t on Linux requires a feature flag. We limit this to Linux
+# because, on Apple platforms, it instead disables APIs we use. See compat(5)
+# and sys/cdefs.h. Reportedly, FreeBSD also breaks when this is set. See
+# https://crbug.com/boringssl/471.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_XOPEN_SOURCE=700")
endif()
@@ -592,7 +593,7 @@
add_subdirectory(ssl)
add_subdirectory(ssl/test)
add_subdirectory(tool)
-add_subdirectory(util/fipstools/cavp)
+add_subdirectory(util/fipstools)
add_subdirectory(util/fipstools/acvp/modulewrapper)
add_subdirectory(decrepit)
@@ -617,7 +618,7 @@
endif()
endif()
-if(UNIX AND NOT APPLE AND NOT ANDROID)
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
set(HANDSHAKER_ARGS "-handshaker-path" $<TARGET_FILE:handshaker>)
endif()
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index d9cfa5c..6ab74b8 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -256,6 +256,7 @@
cipher_extra/e_aesctrhmac.c
cipher_extra/e_aesgcmsiv.c
cipher_extra/e_chacha20poly1305.c
+ cipher_extra/e_des.c
cipher_extra/e_null.c
cipher_extra/e_rc2.c
cipher_extra/e_rc4.c
@@ -274,6 +275,7 @@
crypto.c
curve25519/curve25519.c
curve25519/spake25519.c
+ des/des.c
dh_extra/params.c
dh_extra/dh_asn1.c
digest_extra/digest_extra.c
diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c
index 9e77375..387eaff 100644
--- a/src/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/src/crypto/cipher_extra/e_aesgcmsiv.c
@@ -857,22 +857,15 @@
#if defined(AES_GCM_SIV_ASM)
-static char avx_aesni_capable(void) {
- const uint32_t ecx = OPENSSL_ia32cap_P[1];
-
- return (ecx & (1 << (57 - 32))) != 0 /* AESNI */ &&
- (ecx & (1 << 28)) != 0 /* AVX */;
-}
-
const EVP_AEAD *EVP_aead_aes_128_gcm_siv(void) {
- if (avx_aesni_capable()) {
+ if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
return &aead_aes_128_gcm_siv_asm;
}
return &aead_aes_128_gcm_siv;
}
const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) {
- if (avx_aesni_capable()) {
+ if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
return &aead_aes_256_gcm_siv_asm;
}
return &aead_aes_256_gcm_siv;
diff --git a/src/crypto/fipsmodule/cipher/e_des.c b/src/crypto/cipher_extra/e_des.c
similarity index 64%
rename from src/crypto/fipsmodule/cipher/e_des.c
rename to src/crypto/cipher_extra/e_des.c
index e77363b..087029b 100644
--- a/src/crypto/fipsmodule/cipher/e_des.c
+++ b/src/crypto/cipher_extra/e_des.c
@@ -59,7 +59,6 @@
#include <openssl/nid.h>
#include "internal.h"
-#include "../delocate.h"
typedef struct {
@@ -88,17 +87,21 @@
return 1;
}
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_cbc) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_cbc;
- out->block_size = 8;
- out->key_len = 8;
- out->iv_len = 8;
- out->ctx_size = sizeof(EVP_DES_KEY);
- out->flags = EVP_CIPH_CBC_MODE;
- out->init = des_init_key;
- out->cipher = des_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_cbc = {
+ /* nid = */ NID_des_cbc,
+ /* block_size = */ 8,
+ /* key_len = */ 8,
+ /* iv_len = */ 8,
+ /* ctx_size = */ sizeof(EVP_DES_KEY),
+ /* flags = */ EVP_CIPH_CBC_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_init_key,
+ /* cipher = */ des_cbc_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_cbc(void) { return &evp_des_cbc; }
static int des_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
size_t in_len) {
@@ -107,25 +110,29 @@
}
in_len -= ctx->cipher->block_size;
- EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
+ EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data;
for (size_t i = 0; i <= in_len; i += ctx->cipher->block_size) {
- DES_ecb_encrypt((DES_cblock *) (in + i), (DES_cblock *) (out + i),
+ DES_ecb_encrypt((DES_cblock *)(in + i), (DES_cblock *)(out + i),
&dat->ks.ks, ctx->encrypt);
}
return 1;
}
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ecb) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_ecb;
- out->block_size = 8;
- out->key_len = 8;
- out->iv_len = 0;
- out->ctx_size = sizeof(EVP_DES_KEY);
- out->flags = EVP_CIPH_ECB_MODE;
- out->init = des_init_key;
- out->cipher = des_ecb_cipher;
-}
+static const EVP_CIPHER evp_des_ecb = {
+ /* nid = */ NID_des_ecb,
+ /* block_size = */ 8,
+ /* key_len = */ 8,
+ /* iv_len = */ 0,
+ /* ctx_size = */ sizeof(EVP_DES_KEY),
+ /* flags = */ EVP_CIPH_ECB_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_init_key,
+ /* cipher = */ des_ecb_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ecb(void) { return &evp_des_ecb; }
typedef struct {
union {
@@ -137,7 +144,7 @@
static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
const uint8_t *iv, int enc) {
DES_cblock *deskey = (DES_cblock *)key;
- DES_EDE_KEY *dat = (DES_EDE_KEY*) ctx->cipher_data;
+ DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
DES_set_key(&deskey[0], &dat->ks.ks[0]);
DES_set_key(&deskey[1], &dat->ks.ks[1]);
@@ -147,8 +154,8 @@
}
static int des_ede3_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
- const uint8_t *in, size_t in_len) {
- DES_EDE_KEY *dat = (DES_EDE_KEY*) ctx->cipher_data;
+ const uint8_t *in, size_t in_len) {
+ DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
DES_ede3_cbc_encrypt(in, out, in_len, &dat->ks.ks[0], &dat->ks.ks[1],
&dat->ks.ks[2], (DES_cblock *)ctx->iv, ctx->encrypt);
@@ -156,22 +163,26 @@
return 1;
}
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede3_cbc) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_ede3_cbc;
- out->block_size = 8;
- out->key_len = 24;
- out->iv_len = 8;
- out->ctx_size = sizeof(DES_EDE_KEY);
- out->flags = EVP_CIPH_CBC_MODE;
- out->init = des_ede3_init_key;
- out->cipher = des_ede3_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_ede3_cbc = {
+ /* nid = */ NID_des_ede3_cbc,
+ /* block_size = */ 8,
+ /* key_len = */ 24,
+ /* iv_len = */ 8,
+ /* ctx_size = */ sizeof(DES_EDE_KEY),
+ /* flags = */ EVP_CIPH_CBC_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_ede3_init_key,
+ /* cipher = */ des_ede3_cbc_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede3_cbc(void) { return &evp_des_ede3_cbc; }
static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
- const uint8_t *iv, int enc) {
- DES_cblock *deskey = (DES_cblock *) key;
- DES_EDE_KEY *dat = (DES_EDE_KEY *) ctx->cipher_data;
+ const uint8_t *iv, int enc) {
+ DES_cblock *deskey = (DES_cblock *)key;
+ DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
DES_set_key(&deskey[0], &dat->ks.ks[0]);
DES_set_key(&deskey[1], &dat->ks.ks[1]);
@@ -180,17 +191,21 @@
return 1;
}
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede_cbc) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_ede_cbc;
- out->block_size = 8;
- out->key_len = 16;
- out->iv_len = 8;
- out->ctx_size = sizeof(DES_EDE_KEY);
- out->flags = EVP_CIPH_CBC_MODE;
- out->init = des_ede_init_key;
- out->cipher = des_ede3_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_ede_cbc = {
+ /* nid = */ NID_des_ede_cbc,
+ /* block_size = */ 8,
+ /* key_len = */ 16,
+ /* iv_len = */ 8,
+ /* ctx_size = */ sizeof(DES_EDE_KEY),
+ /* flags = */ EVP_CIPH_CBC_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_ede_init_key,
+ /* cipher = */ des_ede3_cbc_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede_cbc(void) { return &evp_des_ede_cbc; }
static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
const uint8_t *in, size_t in_len) {
@@ -208,30 +223,36 @@
return 1;
}
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_ede_ecb;
- out->block_size = 8;
- out->key_len = 16;
- out->iv_len = 0;
- out->ctx_size = sizeof(DES_EDE_KEY);
- out->flags = EVP_CIPH_ECB_MODE;
- out->init = des_ede_init_key;
- out->cipher = des_ede_ecb_cipher;
-}
+static const EVP_CIPHER evp_des_ede = {
+ /* nid = */ NID_des_ede_ecb,
+ /* block_size = */ 8,
+ /* key_len = */ 16,
+ /* iv_len = */ 0,
+ /* ctx_size = */ sizeof(DES_EDE_KEY),
+ /* flags = */ EVP_CIPH_ECB_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_ede_init_key,
+ /* cipher = */ des_ede_ecb_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede3) {
- memset(out, 0, sizeof(EVP_CIPHER));
- out->nid = NID_des_ede3_ecb;
- out->block_size = 8;
- out->key_len = 24;
- out->iv_len = 0;
- out->ctx_size = sizeof(DES_EDE_KEY);
- out->flags = EVP_CIPH_ECB_MODE;
- out->init = des_ede3_init_key;
- out->cipher = des_ede_ecb_cipher;
-}
+const EVP_CIPHER *EVP_des_ede(void) { return &evp_des_ede; }
-const EVP_CIPHER* EVP_des_ede3_ecb(void) {
- return EVP_des_ede3();
-}
+static const EVP_CIPHER evp_des_ede3 = {
+ /* nid = */ NID_des_ede3_ecb,
+ /* block_size = */ 8,
+ /* key_len = */ 24,
+ /* iv_len = */ 0,
+ /* ctx_size = */ sizeof(DES_EDE_KEY),
+ /* flags = */ EVP_CIPH_ECB_MODE,
+ /* app_data = */ NULL,
+ /* init = */ des_ede3_init_key,
+ /* cipher = */ des_ede_ecb_cipher,
+ /* cleanup = */ NULL,
+ /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede3(void) { return &evp_des_ede3; }
+
+const EVP_CIPHER *EVP_des_ede3_ecb(void) { return EVP_des_ede3(); }
diff --git a/src/crypto/cipher_extra/internal.h b/src/crypto/cipher_extra/internal.h
index 0f5f566..4e8fa46 100644
--- a/src/crypto/cipher_extra/internal.h
+++ b/src/crypto/cipher_extra/internal.h
@@ -171,8 +171,7 @@
"wrong chacha20_poly1305_seal_data size");
OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) {
- const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0;
- return sse41_capable;
+ return CRYPTO_is_SSE4_1_capable();
}
// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts
diff --git a/src/crypto/curve25519/curve25519.c b/src/crypto/curve25519/curve25519.c
index 64aa1e6..7cb0add 100644
--- a/src/crypto/curve25519/curve25519.c
+++ b/src/crypto/curve25519/curve25519.c
@@ -502,27 +502,21 @@
int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
fe u;
fe_loose v;
- fe v3;
+ fe w;
fe vxx;
fe_loose check;
fe_frombytes(&h->Y, s);
fe_1(&h->Z);
- fe_sq_tt(&v3, &h->Y);
- fe_mul_ttt(&vxx, &v3, &d);
- fe_sub(&v, &v3, &h->Z); // u = y^2-1
+ fe_sq_tt(&w, &h->Y);
+ fe_mul_ttt(&vxx, &w, &d);
+ fe_sub(&v, &w, &h->Z); // u = y^2-1
fe_carry(&u, &v);
fe_add(&v, &vxx, &h->Z); // v = dy^2+1
- fe_sq_tl(&v3, &v);
- fe_mul_ttl(&v3, &v3, &v); // v3 = v^3
- fe_sq_tt(&h->X, &v3);
- fe_mul_ttl(&h->X, &h->X, &v);
- fe_mul_ttt(&h->X, &h->X, &u); // x = uv^7
-
- fe_pow22523(&h->X, &h->X); // x = (uv^7)^((q-5)/8)
- fe_mul_ttt(&h->X, &h->X, &v3);
- fe_mul_ttt(&h->X, &h->X, &u); // x = uv^3(uv^7)^((q-5)/8)
+ fe_mul_ttl(&w, &u, &v); // w = u*v
+ fe_pow22523(&h->X, &w); // x = w^((q-5)/8)
+ fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8)
fe_sq_tt(&vxx, &h->X);
fe_mul_ttl(&vxx, &vxx, &v);
diff --git a/src/crypto/fipsmodule/des/des.c b/src/crypto/des/des.c
similarity index 100%
rename from src/crypto/fipsmodule/des/des.c
rename to src/crypto/des/des.c
diff --git a/src/crypto/fipsmodule/des/internal.h b/src/crypto/des/internal.h
similarity index 99%
rename from src/crypto/fipsmodule/des/internal.h
rename to src/crypto/des/internal.h
index 3e3992e..2124fd5 100644
--- a/src/crypto/fipsmodule/des/internal.h
+++ b/src/crypto/des/internal.h
@@ -59,7 +59,7 @@
#include <openssl/base.h>
-#include "../../internal.h"
+#include "../internal.h"
#if defined(__cplusplus)
extern "C" {
diff --git a/src/crypto/err/ssl.errordata b/src/crypto/err/ssl.errordata
index 6879134..4205402 100644
--- a/src/crypto/err/ssl.errordata
+++ b/src/crypto/err/ssl.errordata
@@ -90,6 +90,7 @@
SSL,318,INVALID_ECH_CONFIG_LIST
SSL,317,INVALID_ECH_PUBLIC_NAME
SSL,159,INVALID_MESSAGE
+SSL,320,INVALID_OUTER_EXTENSION
SSL,251,INVALID_OUTER_RECORD_TYPE
SSL,269,INVALID_SCT_LIST
SSL,295,INVALID_SIGNATURE_ALGORITHM
@@ -133,7 +134,6 @@
SSL,187,OLD_SESSION_CIPHER_NOT_RETURNED
SSL,268,OLD_SESSION_PRF_HASH_MISMATCH
SSL,188,OLD_SESSION_VERSION_NOT_RETURNED
-SSL,320,OUTER_EXTENSION_NOT_FOUND
SSL,189,OUTPUT_ALIASES_INPUT
SSL,190,PARSE_TLSEXT
SSL,191,PATH_TOO_LONG
diff --git a/src/crypto/fipsmodule/FIPS.md b/src/crypto/fipsmodule/FIPS.md
index d3b3890..bc5708f 100644
--- a/src/crypto/fipsmodule/FIPS.md
+++ b/src/crypto/fipsmodule/FIPS.md
@@ -12,30 +12,21 @@
1. 2018-07-30: certificate [#3318](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3318), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20180730.docx) (in docx format).
1. 2019-08-08: certificate [#3678](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3678), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20190808.docx) (in docx format).
1. 2019-10-20: certificate [#3753](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3753), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20191020.docx) (in docx format).
+1. 2021-01-28: certificate [#4156](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/4156), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx) (in docx format).
-## Running CAVP tests
+## Running ACVP tests
-CAVP results are calculated by `util/fipstools/cavp`, but that binary is almost always run by `util/fipstools/run_cavp.go`. The latter knows the set of tests to be processed and the flags needed to configure `cavp` for each one. It must be run from the top of a CAVP directory and needs the following options:
+See `util/fipstools/acvp/ACVP.md` for details of how ACVP testing is done.
-1. `-oracle-bin`: points to the location of `util/fipstools/cavp`
-2. `-no-fax`: this is needed to suppress checking of the FAX files, which are only included in sample sets.
+## Breaking known-answer and continuous tests
-## Breaking power-on and continuous tests
+Each known-answer test (KAT) uses a unique, random input value. `util/fipstools/break-kat.go` contains a listing of those values and can be used to corrupt a given test in a binary. Since changes to the KAT input values will invalidate the integrity test, `BORINGSSL_FIPS_BREAK_TESTS` can be defined in `fips_break_tests.h` to disable it for the purposes of testing.
-In order to demonstrate failures of the various FIPS 140 tests, BoringSSL can be built in ways that will trigger such failures. This is controlled by passing `-DFIPS_BREAK_TEST=`(test to break) to CMake, where the following tests can be specified:
+Some FIPS tests cannot be broken by replacing a known string in the binary. For those, when `BORINGSSL_FIPS_BREAK_TESTS` is defined, the environment variable `BORINGSSL_FIPS_BREAK_TEST` can be set to one of a number of values in order to break the corresponding test:
-1. AES\_CBC
-1. AES\_GCM
-1. DES
-1. SHA\_1
-1. SHA\_256
-1. SHA\_512
-1. RSA\_SIG
-1. ECDSA\_SIG
-1. DRBG
-1. RSA\_PWCT
-1. ECDSA\_PWCT
-1. TLS\_KDF
+1. `RSA_PWCT`
+1. `ECDSA_PWCT`
+1. `CRNG`
## Breaking the integrity test
@@ -61,12 +52,6 @@
FIPS requires that RNG state be zeroed when the process exits. In order to implement this, all per-thread RNG states are tracked in a linked list and a destructor function is included which clears them. In order for this to be safe in the presence of threads, a lock is used to stop all other threads from using the RNG once this process has begun. Thus the main thread exiting may cause other threads to deadlock, and drawing on entropy in a destructor function may also deadlock.
-## Self-test optimisation
-
-On Android, the self-tests are optimised in line with [IG](https://csrc.nist.gov/csrc/media/projects/cryptographic-module-validation-program/documents/fips140-2/fips1402ig.pdf) section 9.11. The module will always perform the integrity test at power-on, but the self-tests will test for the presence of a file named after the hex encoded, HMAC-SHA-256 hash of the module in `/dev/boringssl/selftest/`. If such a file is found then the self-tests are skipped. Otherwise, after the self-tests complete successfully, that file will be written. Any I/O errors are ignored and, if they occur when testing for the presence of the file, the module acts as if it's not present.
-
-It is intended that a `tmpfs` be mounted at that location in order to skip running the self tests for every process once they have already passed in a given instance of the operating system.
-
## Integrity Test
FIPS-140 mandates that a module calculate an HMAC of its own code in a constructor function and compare the result to a known-good value. Typical code produced by a C compiler includes large numbers of relocations: places in the machine code where the linker needs to resolve and inject the final value of a symbolic expression. These relocations mean that the bytes that make up any specific bit of code generally aren't known until the final link has completed.
diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h
index 9f7dd47..0685bc4 100644
--- a/src/crypto/fipsmodule/aes/internal.h
+++ b/src/crypto/fipsmodule/aes/internal.h
@@ -30,18 +30,14 @@
#define HWAES
#define HWAES_ECB
-OPENSSL_INLINE int hwaes_capable(void) {
- return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0;
-}
+OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_AESNI_capable(); }
#define VPAES
#if defined(OPENSSL_X86_64)
#define VPAES_CTR32
#endif
#define VPAES_CBC
-OPENSSL_INLINE int vpaes_capable(void) {
- return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_SSSE3_capable(); }
#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
#define HWAES
diff --git a/src/crypto/fipsmodule/bcm.c b/src/crypto/fipsmodule/bcm.c
index 639235e..1219bc7 100644
--- a/src/crypto/fipsmodule/bcm.c
+++ b/src/crypto/fipsmodule/bcm.c
@@ -58,8 +58,6 @@
#include "cipher/aead.c"
#include "cipher/cipher.c"
#include "cipher/e_aes.c"
-#include "cipher/e_des.c"
-#include "des/des.c"
#include "dh/check.c"
#include "dh/dh.c"
#include "digest/digest.c"
@@ -192,16 +190,23 @@
#endif
assert_within(rodata_start, kPrimes, rodata_end);
- assert_within(rodata_start, des_skb, rodata_end);
assert_within(rodata_start, kP256Params, rodata_end);
assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end);
#if defined(OPENSSL_AARCH64) || defined(OPENSSL_ANDROID)
uint8_t result[SHA256_DIGEST_LENGTH];
const EVP_MD *const kHashFunction = EVP_sha256();
+ if (!boringssl_self_test_sha256() ||
+ !boringssl_self_test_hmac_sha256()) {
+ goto err;
+ }
#else
uint8_t result[SHA512_DIGEST_LENGTH];
const EVP_MD *const kHashFunction = EVP_sha512();
+ if (!boringssl_self_test_sha512() ||
+ !boringssl_self_test_hmac_sha256()) {
+ goto err;
+ }
#endif
static const uint8_t kHMACKey[64] = {0};
@@ -238,20 +243,18 @@
const uint8_t *expected = BORINGSSL_bcm_text_hash;
if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) {
+#if !defined(BORINGSSL_FIPS_BREAK_TESTS)
goto err;
+#endif
}
OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10.
-
- if (!boringssl_fips_self_test(BORINGSSL_bcm_text_hash, sizeof(result))) {
- goto err;
- }
-#else
- if (!BORINGSSL_self_test()) {
- goto err;
- }
#endif // OPENSSL_ASAN
+ if (!boringssl_self_test_startup()) {
+ goto err;
+ }
+
return;
err:
diff --git a/src/crypto/fipsmodule/bn/rsaz_exp.h b/src/crypto/fipsmodule/bn/rsaz_exp.h
index 2f0c2c0..104bb7a 100644
--- a/src/crypto/fipsmodule/bn/rsaz_exp.h
+++ b/src/crypto/fipsmodule/bn/rsaz_exp.h
@@ -41,18 +41,17 @@
BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
OPENSSL_INLINE int rsaz_avx2_capable(void) {
- const uint32_t *cap = OPENSSL_ia32cap_get();
- return (cap[2] & (1 << 5)) != 0; // AVX2
+ return CRYPTO_is_AVX2_capable();
}
OPENSSL_INLINE int rsaz_avx2_preferred(void) {
- const uint32_t *cap = OPENSSL_ia32cap_get();
- static const uint32_t kBMI2AndADX = (1 << 8) | (1 << 19);
- if ((cap[2] & kBMI2AndADX) == kBMI2AndADX) {
- // If BMI2 and ADX are available, x86_64-mont5.pl is faster.
+ if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
+ CRYPTO_is_ADX_capable()) {
+ // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
+ // .Lmulx4x_enter and .Lpowerx5_enter branches.
return 0;
}
- return (cap[2] & (1 << 5)) != 0; // AVX2
+ return CRYPTO_is_AVX2_capable();
}
diff --git a/src/crypto/fipsmodule/dh/dh.c b/src/crypto/fipsmodule/dh/dh.c
index ab596e9..b59afc6 100644
--- a/src/crypto/fipsmodule/dh/dh.c
+++ b/src/crypto/fipsmodule/dh/dh.c
@@ -64,6 +64,7 @@
#include <openssl/mem.h>
#include <openssl/thread.h>
+#include "internal.h"
#include "../../internal.h"
#include "../bn/internal.h"
@@ -186,6 +187,8 @@
}
int DH_generate_key(DH *dh) {
+ boringssl_ensure_ffdh_self_test();
+
int ok = 0;
int generate_new_key = 0;
BN_CTX *ctx = NULL;
@@ -322,7 +325,8 @@
return ret;
}
-int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+int dh_compute_key_padded_no_self_test(unsigned char *out,
+ const BIGNUM *peers_key, DH *dh) {
BN_CTX *ctx = BN_CTX_new();
if (ctx == NULL) {
return -1;
@@ -343,7 +347,15 @@
return ret;
}
+int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+ boringssl_ensure_ffdh_self_test();
+
+ return dh_compute_key_padded_no_self_test(out, peers_key, dh);
+}
+
int DH_compute_key(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+ boringssl_ensure_ffdh_self_test();
+
BN_CTX *ctx = BN_CTX_new();
if (ctx == NULL) {
return -1;
diff --git a/src/crypto/fipsmodule/dh/internal.h b/src/crypto/fipsmodule/dh/internal.h
new file mode 100644
index 0000000..c40172d
--- /dev/null
+++ b/src/crypto/fipsmodule/dh/internal.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
+#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+// dh_compute_key_padded_no_self_test does the same as |DH_compute_key_padded|,
+// but doesn't try to run the self-test first. This is for use in the self tests
+// themselves, to prevent an infinite loop.
+int dh_compute_key_padded_no_self_test(unsigned char *out,
+ const BIGNUM *peers_key, DH *dh);
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index 1f03e15..93fdcfc 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -943,8 +943,9 @@
return ok;
}
-int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
- const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) {
+int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r,
+ const BIGNUM *g_scalar, const EC_POINT *p,
+ const BIGNUM *p_scalar, BN_CTX *ctx) {
// Previously, this function set |r| to the point at infinity if there was
// nothing to multiply. But, nobody should be calling this function with
// nothing to multiply in the first place.
@@ -1010,6 +1011,13 @@
return ret;
}
+int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
+ const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) {
+ boringssl_ensure_ecc_self_test();
+
+ return ec_point_mul_no_self_test(group, r, g_scalar, p, p_scalar, ctx);
+}
+
int ec_point_mul_scalar_public(const EC_GROUP *group, EC_RAW_POINT *r,
const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
const EC_SCALAR *p_scalar) {
diff --git a/src/crypto/fipsmodule/ec/ec_key.c b/src/crypto/fipsmodule/ec/ec_key.c
index 7a6daab..d7acf96 100644
--- a/src/crypto/fipsmodule/ec/ec_key.c
+++ b/src/crypto/fipsmodule/ec/ec_key.c
@@ -339,9 +339,9 @@
if (key->priv_key) {
uint8_t data[16] = {0};
ECDSA_SIG *sig = ECDSA_do_sign(data, sizeof(data), key);
-#if defined(BORINGSSL_FIPS_BREAK_ECDSA_PWCT)
- data[0] = ~data[0];
-#endif
+ if (boringssl_fips_break_test("ECDSA_PWCT")) {
+ data[0] = ~data[0];
+ }
int ok = sig != NULL &&
ECDSA_do_verify(data, sizeof(data), sig, key);
ECDSA_SIG_free(sig);
@@ -439,6 +439,8 @@
}
int EC_KEY_generate_key_fips(EC_KEY *eckey) {
+ boringssl_ensure_ecc_self_test();
+
if (EC_KEY_generate_key(eckey) && EC_KEY_check_fips(eckey)) {
return 1;
}
diff --git a/src/crypto/fipsmodule/ec/internal.h b/src/crypto/fipsmodule/ec/internal.h
index 289c3aa..488adb8 100644
--- a/src/crypto/fipsmodule/ec/internal.h
+++ b/src/crypto/fipsmodule/ec/internal.h
@@ -301,6 +301,13 @@
int ec_point_set_affine_coordinates(const EC_GROUP *group, EC_AFFINE *out,
const EC_FELEM *x, const EC_FELEM *y);
+// ec_point_mul_no_self_test does the same as |EC_POINT_mul|, but doesn't try to
+// run the self-test first. This is for use in the self tests themselves, to
+// prevent an infinite loop.
+int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r,
+ const BIGNUM *g_scalar, const EC_POINT *p,
+ const BIGNUM *p_scalar, BN_CTX *ctx);
+
// ec_point_mul_scalar sets |r| to |p| * |scalar|. Both inputs are considered
// secret.
int ec_point_mul_scalar(const EC_GROUP *group, EC_RAW_POINT *r,
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index 99deb36..506b7d2 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -554,7 +554,7 @@
static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
EC_SCALAR *out,
const EC_SCALAR *in) {
- if ((OPENSSL_ia32cap_get()[1] & (1 << 28)) == 0) {
+ if (!CRYPTO_is_AVX_capable()) {
// No AVX support; fallback to generic code.
return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
}
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index a083f3d..f6f070a 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -98,7 +98,7 @@
}
TEST(P256_X86_64Test, BEEU) {
- if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+ if (!CRYPTO_is_AVX_capable()) {
// No AVX support; cannot run the BEEU code.
return;
}
diff --git a/src/crypto/fipsmodule/ecdh/ecdh.c b/src/crypto/fipsmodule/ecdh/ecdh.c
index 4e6d0bf..36fbadc 100644
--- a/src/crypto/fipsmodule/ecdh/ecdh.c
+++ b/src/crypto/fipsmodule/ecdh/ecdh.c
@@ -75,10 +75,13 @@
#include <openssl/sha.h>
#include "../ec/internal.h"
+#include "../../internal.h"
int ECDH_compute_key_fips(uint8_t *out, size_t out_len, const EC_POINT *pub_key,
const EC_KEY *priv_key) {
+ boringssl_ensure_ecc_self_test();
+
if (priv_key->priv_key == NULL) {
OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE);
return 0;
diff --git a/src/crypto/fipsmodule/ecdsa/ecdsa.c b/src/crypto/fipsmodule/ecdsa/ecdsa.c
index 5d99903..db0c6e5 100644
--- a/src/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/src/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -151,8 +151,8 @@
return 1;
}
-int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
- const ECDSA_SIG *sig, const EC_KEY *eckey) {
+int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len,
+ const ECDSA_SIG *sig, const EC_KEY *eckey) {
const EC_GROUP *group = EC_KEY_get0_group(eckey);
const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey);
if (group == NULL || pub_key == NULL || sig == NULL) {
@@ -198,6 +198,13 @@
return 1;
}
+int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
+ const ECDSA_SIG *sig, const EC_KEY *eckey) {
+ boringssl_ensure_ecc_self_test();
+
+ return ecdsa_do_verify_no_self_test(digest, digest_len, sig, eckey);
+}
+
static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry,
const EC_SCALAR *priv_key, const EC_SCALAR *k,
const uint8_t *digest, size_t digest_len) {
@@ -292,12 +299,16 @@
ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing(
const uint8_t *digest, size_t digest_len, const EC_KEY *eckey,
const uint8_t *nonce, size_t nonce_len) {
+ boringssl_ensure_ecc_self_test();
+
return ecdsa_sign_with_nonce_for_known_answer_test(digest, digest_len, eckey,
nonce, nonce_len);
}
ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
const EC_KEY *eckey) {
+ boringssl_ensure_ecc_self_test();
+
if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) {
OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED);
return NULL;
diff --git a/src/crypto/fipsmodule/ecdsa/internal.h b/src/crypto/fipsmodule/ecdsa/internal.h
index 5115dfa..645959f 100644
--- a/src/crypto/fipsmodule/ecdsa/internal.h
+++ b/src/crypto/fipsmodule/ecdsa/internal.h
@@ -31,6 +31,12 @@
const uint8_t *nonce,
size_t nonce_len);
+// ecdsa_do_verify_no_self_test does the same as |ECDSA_do_verify|, but doesn't
+// try to run the self-test first. This is for use in the self tests themselves,
+// to prevent an infinite loop.
+int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len,
+ const ECDSA_SIG *sig, const EC_KEY *eckey);
+
#if defined(__cplusplus)
}
diff --git a/src/crypto/fipsmodule/modes/gcm.c b/src/crypto/fipsmodule/modes/gcm.c
index 28218b4..5b909aa 100644
--- a/src/crypto/fipsmodule/modes/gcm.c
+++ b/src/crypto/fipsmodule/modes/gcm.c
@@ -152,7 +152,7 @@
#if defined(GHASH_ASM_X86_64)
if (crypto_gcm_clmul_enabled()) {
- if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) { // AVX+MOVBE
+ if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
gcm_init_avx(out_table, H.u);
*out_mult = gcm_gmult_avx;
*out_hash = gcm_ghash_avx;
@@ -164,7 +164,7 @@
*out_hash = gcm_ghash_clmul;
return;
}
- if (gcm_ssse3_capable()) {
+ if (CRYPTO_is_SSSE3_capable()) {
gcm_init_ssse3(out_table, H.u);
*out_mult = gcm_gmult_ssse3;
*out_hash = gcm_ghash_ssse3;
@@ -177,7 +177,7 @@
*out_hash = gcm_ghash_clmul;
return;
}
- if (gcm_ssse3_capable()) {
+ if (CRYPTO_is_SSSE3_capable()) {
gcm_init_ssse3(out_table, H.u);
*out_mult = gcm_gmult_ssse3;
*out_hash = gcm_ghash_ssse3;
@@ -722,9 +722,7 @@
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
int crypto_gcm_clmul_enabled(void) {
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
- const uint32_t *ia32cap = OPENSSL_ia32cap_get();
- return (ia32cap[0] & (1 << 24)) && // check FXSR bit
- (ia32cap[1] & (1 << 1)); // check PCLMULQDQ bit
+ return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable();
#else
return 0;
#endif
diff --git a/src/crypto/fipsmodule/modes/gcm_test.cc b/src/crypto/fipsmodule/modes/gcm_test.cc
index 539b764..d66d8ae 100644
--- a/src/crypto/fipsmodule/modes/gcm_test.cc
+++ b/src/crypto/fipsmodule/modes/gcm_test.cc
@@ -136,7 +136,7 @@
alignas(16) u128 Htable[16];
#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
- if (gcm_ssse3_capable()) {
+ if (CRYPTO_is_SSSE3_capable()) {
CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
for (size_t blocks : kBlockCounts) {
@@ -152,7 +152,7 @@
}
#if defined(GHASH_ASM_X86_64)
- if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) { // AVX+MOVBE
+ if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
CHECK_ABI_SEH(gcm_init_avx, Htable, kH);
CHECK_ABI_SEH(gcm_gmult_avx, X, Htable);
for (size_t blocks : kBlockCounts) {
diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h
index f022f9b..0164aac 100644
--- a/src/crypto/fipsmodule/modes/internal.h
+++ b/src/crypto/fipsmodule/modes/internal.h
@@ -253,10 +253,6 @@
void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
size_t len);
-OPENSSL_INLINE char gcm_ssse3_capable(void) {
- return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
-
// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
// 16-byte-aligned, but |gcm_init_ssse3| does not.
void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
diff --git a/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx b/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx
new file mode 100644
index 0000000..17fcd25
--- /dev/null
+++ b/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx
Binary files differ
diff --git a/src/crypto/fipsmodule/rand/internal.h b/src/crypto/fipsmodule/rand/internal.h
index bbeef76..eccf047 100644
--- a/src/crypto/fipsmodule/rand/internal.h
+++ b/src/crypto/fipsmodule/rand/internal.h
@@ -143,15 +143,14 @@
#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
OPENSSL_INLINE int have_rdrand(void) {
- return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+ return CRYPTO_is_RDRAND_capable();
}
// have_fast_rdrand returns true if RDRAND is supported and it's reasonably
// fast. Concretely the latter is defined by whether the chip is Intel (fast) or
// not (assumed slow).
OPENSSL_INLINE int have_fast_rdrand(void) {
- const uint32_t *const ia32cap = OPENSSL_ia32cap_get();
- return (ia32cap[1] & (1u << 30)) && (ia32cap[0] & (1u << 30));
+ return CRYPTO_is_RDRAND_capable() && CRYPTO_is_intel_cpu();
}
// CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to
diff --git a/src/crypto/fipsmodule/rand/rand.c b/src/crypto/fipsmodule/rand/rand.c
index 9c54fc5..357be39 100644
--- a/src/crypto/fipsmodule/rand/rand.c
+++ b/src/crypto/fipsmodule/rand/rand.c
@@ -170,11 +170,11 @@
CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len);
}
-#if defined(BORINGSSL_FIPS_BREAK_CRNG)
- // This breaks the "continuous random number generator test" defined in FIPS
- // 140-2, section 4.9.2, and implemented in |rand_get_seed|.
- OPENSSL_memset(out_entropy, 0, out_entropy_len);
-#endif
+ if (boringssl_fips_break_test("CRNG")) {
+ // This breaks the "continuous random number generator test" defined in FIPS
+ // 140-2, section 4.9.2, and implemented in |rand_get_seed|.
+ OPENSSL_memset(out_entropy, 0, out_entropy_len);
+ }
}
// In passive entropy mode, entropy is supplied from outside of the module via
diff --git a/src/crypto/fipsmodule/rsa/internal.h b/src/crypto/fipsmodule/rsa/internal.h
index d9d6fac..1cb3b5f 100644
--- a/src/crypto/fipsmodule/rsa/internal.h
+++ b/src/crypto/fipsmodule/rsa/internal.h
@@ -124,6 +124,28 @@
extern const size_t kBoringSSLRSASqrtTwoLen;
+// Functions that avoid self-tests.
+//
+// Self-tests need to call functions that don't try and ensure that the
+// self-tests have passed. These functions, in turn, need to limit themselves
+// to such functions too.
+//
+// These functions are the same as their public versions, but skip the self-test
+// check.
+
+int rsa_verify_no_self_test(int hash_nid, const uint8_t *digest,
+ size_t digest_len, const uint8_t *sig,
+ size_t sig_len, RSA *rsa);
+
+int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+ size_t max_out, const uint8_t *in,
+ size_t in_len, int padding);
+
+int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest,
+ unsigned digest_len, uint8_t *out, unsigned *out_len,
+ RSA *rsa);
+
+
#if defined(__cplusplus)
} // extern C
#endif
diff --git a/src/crypto/fipsmodule/rsa/rsa.c b/src/crypto/fipsmodule/rsa/rsa.c
index 3205d7d..733e7fa 100644
--- a/src/crypto/fipsmodule/rsa/rsa.c
+++ b/src/crypto/fipsmodule/rsa/rsa.c
@@ -303,8 +303,9 @@
return out_len;
}
-int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
- const uint8_t *in, size_t in_len, int padding) {
+static int rsa_sign_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+ size_t max_out, const uint8_t *in,
+ size_t in_len, int padding) {
if (rsa->meth->sign_raw) {
return rsa->meth->sign_raw(rsa, out_len, out, max_out, in, in_len, padding);
}
@@ -312,6 +313,13 @@
return rsa_default_sign_raw(rsa, out_len, out, max_out, in, in_len, padding);
}
+int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
+ const uint8_t *in, size_t in_len, int padding) {
+ boringssl_ensure_rsa_self_test();
+ return rsa_sign_raw_no_self_test(rsa, out_len, out, max_out, in, in_len,
+ padding);
+}
+
int RSA_private_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa,
int padding) {
size_t out_len;
@@ -523,8 +531,9 @@
return 0;
}
-int RSA_sign(int hash_nid, const uint8_t *digest, unsigned digest_len,
- uint8_t *out, unsigned *out_len, RSA *rsa) {
+int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest,
+ unsigned digest_len, uint8_t *out, unsigned *out_len,
+ RSA *rsa) {
const unsigned rsa_size = RSA_size(rsa);
int ret = 0;
uint8_t *signed_msg = NULL;
@@ -539,8 +548,9 @@
if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
&signed_msg_is_alloced, hash_nid, digest,
digest_len) ||
- !RSA_sign_raw(rsa, &size_t_out_len, out, rsa_size, signed_msg,
- signed_msg_len, RSA_PKCS1_PADDING)) {
+ !rsa_sign_raw_no_self_test(rsa, &size_t_out_len, out, rsa_size,
+ signed_msg, signed_msg_len,
+ RSA_PKCS1_PADDING)) {
goto err;
}
@@ -554,6 +564,13 @@
return ret;
}
+int RSA_sign(int hash_nid, const uint8_t *digest, unsigned digest_len,
+ uint8_t *out, unsigned *out_len, RSA *rsa) {
+ boringssl_ensure_rsa_self_test();
+
+ return rsa_sign_no_self_test(hash_nid, digest, digest_len, out, out_len, rsa);
+}
+
int RSA_sign_pss_mgf1(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
const uint8_t *digest, size_t digest_len,
const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len) {
@@ -577,8 +594,9 @@
return ret;
}
-int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len,
- const uint8_t *sig, size_t sig_len, RSA *rsa) {
+int rsa_verify_no_self_test(int hash_nid, const uint8_t *digest,
+ size_t digest_len, const uint8_t *sig,
+ size_t sig_len, RSA *rsa) {
if (rsa->n == NULL || rsa->e == NULL) {
OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING);
return 0;
@@ -602,12 +620,9 @@
return 0;
}
- if (!RSA_verify_raw(rsa, &len, buf, rsa_size, sig, sig_len,
- RSA_PKCS1_PADDING)) {
- goto out;
- }
-
- if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
+ if (!rsa_verify_raw_no_self_test(rsa, &len, buf, rsa_size, sig, sig_len,
+ RSA_PKCS1_PADDING) ||
+ !RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
&signed_msg_is_alloced, hash_nid, digest,
digest_len)) {
goto out;
@@ -630,6 +645,13 @@
return ret;
}
+int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len,
+ const uint8_t *sig, size_t sig_len, RSA *rsa) {
+ boringssl_ensure_rsa_self_test();
+ return rsa_verify_no_self_test(hash_nid, digest, digest_len, sig, sig_len,
+ rsa);
+}
+
int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len,
const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len,
const uint8_t *sig, size_t sig_len) {
@@ -905,9 +927,9 @@
ret = 0;
goto cleanup;
}
-#if defined(BORINGSSL_FIPS_BREAK_RSA_PWCT)
- data[0] = ~data[0];
-#endif
+ if (boringssl_fips_break_test("RSA_PWCT")) {
+ data[0] = ~data[0];
+ }
if (!RSA_verify(NID_sha256, data, sizeof(data), sig, sig_len, key)) {
OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR);
ret = 0;
diff --git a/src/crypto/fipsmodule/rsa/rsa_impl.c b/src/crypto/fipsmodule/rsa/rsa_impl.c
index a6865c0..1046f35 100644
--- a/src/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/src/crypto/fipsmodule/rsa/rsa_impl.c
@@ -261,6 +261,8 @@
int RSA_encrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
const uint8_t *in, size_t in_len, int padding) {
+ boringssl_ensure_rsa_self_test();
+
if (!rsa_check_public_key(rsa)) {
return 0;
}
@@ -528,6 +530,8 @@
int rsa_default_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
const uint8_t *in, size_t in_len, int padding) {
+ boringssl_ensure_rsa_self_test();
+
const unsigned rsa_size = RSA_size(rsa);
uint8_t *buf = NULL;
int ret = 0;
@@ -593,8 +597,9 @@
static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx);
-int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
- const uint8_t *in, size_t in_len, int padding) {
+int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+ size_t max_out, const uint8_t *in,
+ size_t in_len, int padding) {
if (!rsa_check_public_key(rsa)) {
return 0;
}
@@ -686,6 +691,14 @@
return ret;
}
+int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out,
+ size_t max_out, const uint8_t *in,
+ size_t in_len, int padding) {
+ boringssl_ensure_rsa_self_test();
+ return rsa_verify_raw_no_self_test(rsa, out_len, out, max_out, in, in_len,
+ padding);
+}
+
int rsa_default_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in,
size_t len) {
if (rsa->n == NULL || rsa->d == NULL) {
@@ -1324,6 +1337,8 @@
static int RSA_generate_key_ex_maybe_fips(RSA *rsa, int bits,
const BIGNUM *e_value, BN_GENCB *cb,
int check_fips) {
+ boringssl_ensure_rsa_self_test();
+
RSA *tmp = NULL;
uint32_t err;
int ret = 0;
diff --git a/src/crypto/fipsmodule/self_check/self_check.c b/src/crypto/fipsmodule/self_check/self_check.c
index 94f2da7..b7cd868 100644
--- a/src/crypto/fipsmodule/self_check/self_check.c
+++ b/src/crypto/fipsmodule/self_check/self_check.c
@@ -20,17 +20,18 @@
#include <openssl/aead.h>
#include <openssl/aes.h>
#include <openssl/bn.h>
-#include <openssl/des.h>
#include <openssl/dh.h>
#include <openssl/digest.h>
#include <openssl/ec.h>
#include <openssl/ecdsa.h>
#include <openssl/ec_key.h>
+#include <openssl/hmac.h>
#include <openssl/nid.h>
#include <openssl/rsa.h>
#include <openssl/sha.h>
#include "../../internal.h"
+#include "../dh/internal.h"
#include "../ec/internal.h"
#include "../ecdsa/internal.h"
#include "../rand/internal.h"
@@ -47,21 +48,6 @@
#else
-#if defined(BORINGSSL_FIPS) && defined(OPENSSL_ANDROID)
-// FIPS builds on Android will test for flag files, named after the module hash,
-// in /dev/boringssl/selftest/. If such a flag file exists, it's assumed that
-// self-tests have already passed and thus do not need to be repeated. (The
-// integrity tests always run, however.)
-//
-// If self-tests complete successfully and the environment variable named in
-// |kFlagWriteEnableEnvVar| is present, then the flag file will be created. The
-// flag file isn't written without the environment variable being set in order
-// to avoid SELinux violations on Android.
-#define BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
-static const char kFlagPrefix[] = "/dev/boringssl/selftest/";
-static const char kFlagWriteEnableEnvVar[] = "BORINGSSL_SELF_TEST_CREATE_FLAG";
-#endif
-
static void hexdump(const uint8_t *in, size_t len) {
for (size_t i = 0; i < len; i++) {
fprintf(stderr, "%02x", in[i]);
@@ -71,7 +57,7 @@
static int check_test(const void *expected, const void *actual,
size_t expected_len, const char *name) {
if (OPENSSL_memcmp(actual, expected, expected_len) != 0) {
- fprintf(stderr, "%s failed.\nExpected: ", name);
+ fprintf(stderr, "%s failed.\nExpected: ", name);
hexdump(expected, expected_len);
fprintf(stderr, "\nCalculated: ");
hexdump(actual, expected_len);
@@ -87,6 +73,28 @@
return *out != NULL;
}
+static int serialize_ecdsa_sig(uint8_t *out, size_t out_len,
+ const ECDSA_SIG *sig) {
+ if ((out_len & 1) || //
+ !BN_bn2bin_padded(out, out_len / 2, sig->r) ||
+ !BN_bn2bin_padded(out + out_len / 2, out_len / 2, sig->s)) {
+ return 0;
+ }
+ return 1;
+}
+
+static ECDSA_SIG *parse_ecdsa_sig(const uint8_t *in, size_t in_len) {
+ ECDSA_SIG *ret = ECDSA_SIG_new();
+ if (!ret || //
+ (in_len & 1) ||
+ BN_bin2bn(in, in_len/2, ret->r) == NULL ||
+ BN_bin2bn(in + in_len/2, in_len/2, ret->s) == NULL) {
+ ECDSA_SIG_free(ret);
+ ret = NULL;
+ }
+ return ret;
+}
+
static RSA *self_test_rsa_key(void) {
static const uint8_t kN[] = {
0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc,
@@ -289,195 +297,185 @@
return NULL;
}
-#if defined(OPENSSL_ANDROID)
-#define MODULE_DIGEST_SIZE SHA256_DIGEST_LENGTH
-#else
-#define MODULE_DIGEST_SIZE SHA512_DIGEST_LENGTH
-#endif
-int boringssl_fips_self_test(
- const uint8_t *module_hash, size_t module_hash_len) {
-#if defined(BORINGSSL_FIPS_SELF_TEST_FLAG_FILE)
- char flag_path[sizeof(kFlagPrefix) + 2 * MODULE_DIGEST_SIZE];
- if (module_hash_len != 0) {
- if (module_hash_len != MODULE_DIGEST_SIZE) {
- fprintf(stderr,
- "module hash of length %zu does not match expected length %d\n",
- module_hash_len, MODULE_DIGEST_SIZE);
- BORINGSSL_FIPS_abort();
- }
+// Lazy self-tests
+//
+// Self tests that are slow are deferred until the corresponding algorithm is
+// actually exercised, in FIPS mode. (In non-FIPS mode these tests are only run
+// when requested by |BORINGSSL_self_test|.)
- // Test whether the flag file exists.
- memcpy(flag_path, kFlagPrefix, sizeof(kFlagPrefix) - 1);
- static const char kHexTable[17] = "0123456789abcdef";
- for (size_t i = 0; i < MODULE_DIGEST_SIZE; i++) {
- flag_path[sizeof(kFlagPrefix) - 1 + 2 * i] =
- kHexTable[module_hash[i] >> 4];
- flag_path[sizeof(kFlagPrefix) - 1 + 2 * i + 1] =
- kHexTable[module_hash[i] & 15];
- }
- flag_path[sizeof(flag_path) - 1] = 0;
+static int boringssl_self_test_rsa(void) {
+ int ret = 0;
+ uint8_t output[256];
- if (access(flag_path, F_OK) == 0) {
- // Flag file found. Skip self-tests.
- return 1;
- }
+ RSA *const rsa_key = self_test_rsa_key();
+ if (rsa_key == NULL) {
+ fprintf(stderr, "RSA key construction failed\n");
+ goto err;
}
-#endif // BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
- static const uint8_t kAESKey[16] = "BoringCrypto Key";
- static const uint8_t kAESIV[16] = {0};
- static const uint8_t kPlaintext[64] =
- "BoringCryptoModule FIPS KAT Encryption and Decryption Plaintext!";
- static const uint8_t kAESCBCCiphertext[64] = {
- 0x87, 0x2d, 0x98, 0xc2, 0xcc, 0x31, 0x5b, 0x41, 0xe0, 0xfa, 0x7b,
- 0x0a, 0x71, 0xc0, 0x42, 0xbf, 0x4f, 0x61, 0xd0, 0x0d, 0x58, 0x8c,
- 0xf7, 0x05, 0xfb, 0x94, 0x89, 0xd3, 0xbc, 0xaa, 0x1a, 0x50, 0x45,
- 0x1f, 0xc3, 0x8c, 0xb8, 0x98, 0x86, 0xa3, 0xe3, 0x6c, 0xfc, 0xad,
- 0x3a, 0xb5, 0x59, 0x27, 0x7d, 0x21, 0x07, 0xca, 0x4c, 0x1d, 0x55,
- 0x34, 0xdd, 0x5a, 0x2d, 0xc4, 0xb4, 0xf5, 0xa8,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_CBC)
- 0x35
-#else
- 0x00
-#endif
+ // RSA Sign KAT
+
+ static const uint8_t kRSASignDigest[32] = {
+ 0xd2, 0xb5, 0x6e, 0x53, 0x30, 0x6f, 0x72, 0x0d, 0x79, 0x29, 0xd8,
+ 0x70, 0x8b, 0xf4, 0x6f, 0x1c, 0x22, 0x30, 0x03, 0x05, 0x58, 0x2b,
+ 0x11, 0x5b, 0xed, 0xca, 0xc7, 0x22, 0xd8, 0xaa, 0x5a, 0xb2,
};
- static const uint8_t kAESGCMCiphertext[80] = {
- 0x4a, 0xd8, 0xe7, 0x7d, 0x78, 0xd7, 0x7d, 0x5e, 0xb2, 0x11, 0xb6, 0xc9,
- 0xa4, 0xbc, 0xb2, 0xae, 0xbe, 0x93, 0xd1, 0xb7, 0xfe, 0x65, 0xc1, 0x82,
- 0x2a, 0xb6, 0x71, 0x5f, 0x1a, 0x7c, 0xe0, 0x1b, 0x2b, 0xe2, 0x53, 0xfa,
- 0xa0, 0x47, 0xfa, 0xd7, 0x8f, 0xb1, 0x4a, 0xc4, 0xdc, 0x89, 0xf9, 0xb4,
- 0x14, 0x4d, 0xde, 0x95, 0xea, 0x29, 0x69, 0x76, 0x81, 0xa3, 0x5c, 0x33,
- 0xd8, 0x37, 0xd8, 0xfa, 0x47, 0x19, 0x46, 0x2f, 0xf1, 0x90, 0xb7, 0x61,
- 0x8f, 0x6f, 0xdd, 0x31, 0x3f, 0x6a, 0x64,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_GCM)
- 0x0d
-#else
- 0x00
-#endif
+ static const uint8_t kRSASignSignature[256] = {
+ 0x64, 0xce, 0xdd, 0x91, 0x27, 0xb0, 0x4f, 0xb9, 0x14, 0xea, 0xc0, 0xb4,
+ 0xa2, 0x06, 0xc5, 0xd8, 0x40, 0x0f, 0x6c, 0x54, 0xac, 0xf7, 0x02, 0xde,
+ 0x26, 0xbb, 0xfd, 0x33, 0xe5, 0x2f, 0x4d, 0xb1, 0x53, 0xc4, 0xff, 0xd0,
+ 0x5f, 0xea, 0x15, 0x89, 0x83, 0x4c, 0xe3, 0x80, 0x0b, 0xe9, 0x13, 0x82,
+ 0x1d, 0x71, 0x92, 0x1a, 0x03, 0x60, 0x2c, 0xaf, 0xe2, 0x16, 0xc7, 0x43,
+ 0x3f, 0xde, 0x6b, 0x94, 0xfd, 0x6e, 0x08, 0x7b, 0x11, 0xf1, 0x34, 0x52,
+ 0xe5, 0xc0, 0x97, 0x66, 0x4a, 0xe0, 0x91, 0x45, 0xc8, 0xb1, 0x3d, 0x6a,
+ 0x54, 0xc1, 0x32, 0x0f, 0x32, 0xad, 0x25, 0x11, 0x3e, 0x49, 0xad, 0x41,
+ 0xce, 0x7b, 0xca, 0x95, 0x6b, 0x54, 0x5e, 0x86, 0x1b, 0xce, 0xfa, 0x2a,
+ 0x60, 0xe8, 0xfa, 0xbb, 0x23, 0xb2, 0x41, 0xbc, 0x7c, 0x98, 0xec, 0x73,
+ 0x20, 0xed, 0xb3, 0xcf, 0xab, 0x07, 0x24, 0x85, 0x6a, 0x2a, 0x61, 0x76,
+ 0x28, 0xf8, 0x00, 0x80, 0xeb, 0xd9, 0x3a, 0x63, 0xe2, 0x01, 0xb1, 0xee,
+ 0x6d, 0xe9, 0x73, 0xe9, 0xb6, 0x75, 0x2e, 0xf9, 0x81, 0xd9, 0xa8, 0x79,
+ 0xf6, 0x8f, 0xe3, 0x02, 0x7d, 0xf6, 0xea, 0xdc, 0x35, 0xe4, 0x62, 0x0d,
+ 0x91, 0xba, 0x3e, 0x7d, 0x8b, 0x82, 0xbf, 0x15, 0x74, 0x6a, 0x4e, 0x29,
+ 0xf8, 0x9b, 0x2c, 0x94, 0x8d, 0xa7, 0x00, 0x4d, 0x7b, 0xbf, 0x35, 0x07,
+ 0xeb, 0xdd, 0x10, 0xef, 0xd5, 0x2f, 0xe6, 0x98, 0x4b, 0x7e, 0x24, 0x80,
+ 0xe2, 0x01, 0xf2, 0x66, 0xb7, 0xd3, 0x93, 0xfe, 0x2a, 0xb3, 0x74, 0xed,
+ 0xec, 0x4b, 0xb1, 0x5f, 0x5f, 0xee, 0x85, 0x44, 0xa7, 0x26, 0xdf, 0xc1,
+ 0x2e, 0x7a, 0xf3, 0xa5, 0x8f, 0xf8, 0x64, 0xda, 0x65, 0xad, 0x91, 0xe2,
+ 0x90, 0x94, 0x20, 0x16, 0xb8, 0x61, 0xa5, 0x0a, 0x7d, 0xb4, 0xbf, 0xc0,
+ 0x10, 0xaf, 0x72, 0x67,
};
- static const DES_cblock kDESKey1 = {"BCMDESK1"};
- static const DES_cblock kDESKey2 = {"BCMDESK2"};
- static const DES_cblock kDESKey3 = {"BCMDESK3"};
- static const DES_cblock kDESIV = {"BCMDESIV"};
- static const uint8_t kDESCiphertext[64] = {
- 0xa4, 0x30, 0x7a, 0x4c, 0x1f, 0x60, 0x16, 0xd7, 0x4f, 0x41, 0xe1,
- 0xbb, 0x27, 0xc4, 0x27, 0x37, 0xd4, 0x7f, 0xb9, 0x10, 0xf8, 0xbc,
- 0xaf, 0x93, 0x91, 0xb8, 0x88, 0x24, 0xb1, 0xf6, 0xf8, 0xbd, 0x31,
- 0x96, 0x06, 0x76, 0xde, 0x32, 0xcd, 0x29, 0x29, 0xba, 0x70, 0x5f,
- 0xea, 0xc0, 0xcb, 0xde, 0xc7, 0x75, 0x90, 0xe0, 0x0f, 0x5e, 0x2c,
- 0x0d, 0x49, 0x20, 0xd5, 0x30, 0x83, 0xf8, 0x08,
-#if !defined(BORINGSSL_FIPS_BREAK_DES)
- 0x5a
-#else
- 0x00
-#endif
+
+ unsigned sig_len;
+ if (!rsa_sign_no_self_test(NID_sha256, kRSASignDigest, sizeof(kRSASignDigest),
+ output, &sig_len, rsa_key) ||
+ !check_test(kRSASignSignature, output, sizeof(kRSASignSignature),
+ "RSA-sign KAT")) {
+ fprintf(stderr, "RSA signing test failed.\n");
+ goto err;
+ }
+
+ // RSA Verify KAT
+
+ static const uint8_t kRSAVerifyDigest[32] = {
+ 0x09, 0x65, 0x2f, 0xd8, 0xed, 0x9d, 0xc2, 0x6d, 0xbc, 0xbf, 0xf2,
+ 0xa7, 0xa5, 0xed, 0xe1, 0x37, 0x13, 0x78, 0x21, 0x36, 0xcf, 0x8d,
+ 0x22, 0x3d, 0xab, 0x93, 0xb4, 0x12, 0xa8, 0xb5, 0x15, 0x53,
};
- static const uint8_t kPlaintextSHA1[20] = {
- 0xc6, 0xf8, 0xc9, 0x63, 0x1c, 0x14, 0x23, 0x62, 0x9b, 0xbd,
- 0x55, 0x82, 0xf4, 0xd6, 0x1d, 0xf2, 0xab, 0x7d, 0xc8,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_1)
- 0x28
-#else
- 0x00
-#endif
+ static const uint8_t kRSAVerifySignature[256] = {
+ 0xab, 0xe2, 0xcb, 0xc1, 0x3d, 0x6b, 0xd3, 0x9d, 0x48, 0xdb, 0x53, 0x34,
+ 0xdd, 0xbf, 0x8d, 0x07, 0x0a, 0x93, 0xbd, 0xcb, 0x10, 0x4e, 0x2c, 0xc5,
+ 0xd0, 0xee, 0x48, 0x6e, 0xe2, 0x95, 0xf6, 0xb3, 0x1b, 0xda, 0x12, 0x6c,
+ 0x41, 0x89, 0x0b, 0x98, 0xb7, 0x3e, 0x70, 0xe6, 0xb6, 0x5d, 0x82, 0xf9,
+ 0x5c, 0x66, 0x31, 0x21, 0x75, 0x5a, 0x90, 0x74, 0x4c, 0x8d, 0x1c, 0x21,
+ 0x14, 0x8a, 0x19, 0x60, 0xbe, 0x0e, 0xca, 0x44, 0x6e, 0x9f, 0xf4, 0x97,
+ 0xf1, 0x34, 0x5c, 0x53, 0x7e, 0xf8, 0x11, 0x9b, 0x9a, 0x43, 0x98, 0xe9,
+ 0x5c, 0x5c, 0x6d, 0xe2, 0xb1, 0xc9, 0x55, 0x90, 0x5c, 0x52, 0x99, 0xd8,
+ 0xce, 0x7a, 0x3b, 0x6a, 0xb7, 0x63, 0x80, 0xd9, 0xba, 0xbd, 0xd1, 0x5f,
+ 0x61, 0x02, 0x37, 0xe1, 0xf3, 0xf2, 0xaa, 0x1c, 0x1f, 0x1e, 0x77, 0x0b,
+ 0x62, 0xfb, 0xb5, 0x96, 0x38, 0x1b, 0x2e, 0xbd, 0xd7, 0x7e, 0xce, 0xf9,
+ 0xc9, 0x0d, 0x4c, 0x92, 0xf7, 0xb6, 0xb0, 0x5f, 0xed, 0x29, 0x36, 0x28,
+ 0x5f, 0xa9, 0x48, 0x26, 0xe6, 0x20, 0x55, 0x32, 0x2a, 0x33, 0xb6, 0xf0,
+ 0x4c, 0x74, 0xce, 0x69, 0xe5, 0xd8, 0xd7, 0x37, 0xfb, 0x83, 0x8b, 0x79,
+ 0xd2, 0xd4, 0x8e, 0x3d, 0xaf, 0x71, 0x38, 0x75, 0x31, 0x88, 0x25, 0x31,
+ 0xa9, 0x5a, 0xc9, 0x64, 0xd0, 0x2e, 0xa4, 0x13, 0xbf, 0x85, 0x95, 0x29,
+ 0x82, 0xbb, 0xc0, 0x89, 0x52, 0x7d, 0xaf, 0xf5, 0xb8, 0x45, 0xc9, 0xa0,
+ 0xf4, 0xd1, 0x4e, 0xf1, 0x95, 0x6d, 0x9c, 0x3a, 0xca, 0xe8, 0x82, 0xd1,
+ 0x2d, 0xa6, 0x6d, 0xa0, 0xf3, 0x57, 0x94, 0xf5, 0xee, 0x32, 0x23, 0x23,
+ 0x33, 0x51, 0x7d, 0xb9, 0x31, 0x52, 0x32, 0xa1, 0x83, 0xb9, 0x91, 0x65,
+ 0x4d, 0xbe, 0xa4, 0x16, 0x15, 0x34, 0x5c, 0x88, 0x53, 0x25, 0x92, 0x67,
+ 0x44, 0xa5, 0x39, 0x15,
};
- static const uint8_t kPlaintextSHA256[32] = {
- 0x37, 0xbd, 0x70, 0x53, 0x72, 0xfc, 0xd4, 0x03, 0x79, 0x70, 0xfb,
- 0x06, 0x95, 0xb1, 0x2a, 0x82, 0x48, 0xe1, 0x3e, 0xf2, 0x33, 0xfb,
- 0xef, 0x29, 0x81, 0x22, 0x45, 0x40, 0x43, 0x70, 0xce,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_256)
- 0x0f
-#else
- 0x00
-#endif
+ if (!rsa_verify_no_self_test(NID_sha256, kRSAVerifyDigest,
+ sizeof(kRSAVerifyDigest), kRSAVerifySignature,
+ sizeof(kRSAVerifySignature), rsa_key)) {
+ fprintf(stderr, "RSA-verify KAT failed.\n");
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ RSA_free(rsa_key);
+
+ return ret;
+}
+
+static int boringssl_self_test_ecc(void) {
+ int ret = 0;
+ EC_KEY *ec_key = NULL;
+ EC_GROUP *ec_group = NULL;
+ EC_POINT *ec_point_in = NULL;
+ EC_POINT *ec_point_out = NULL;
+ BIGNUM *ec_scalar = NULL;
+ ECDSA_SIG *sig = NULL;
+
+ ec_key = self_test_ecdsa_key();
+ if (ec_key == NULL) {
+ fprintf(stderr, "ECDSA KeyGen failed\n");
+ goto err;
+ }
+
+ // ECDSA Sign/Verify KAT
+
+ static const uint8_t kECDSASignDigest[32] = {
+ 0x1e, 0x35, 0x93, 0x0b, 0xe8, 0x60, 0xd0, 0x94, 0x2c, 0xa7, 0xbb,
+ 0xd6, 0xf6, 0xde, 0xd8, 0x7f, 0x15, 0x7e, 0x4d, 0xe2, 0x4f, 0x81,
+ 0xed, 0x4b, 0x87, 0x5c, 0x0e, 0x01, 0x8e, 0x89, 0xa8, 0x1f,
};
- static const uint8_t kPlaintextSHA512[64] = {
- 0x08, 0x6a, 0x1c, 0x84, 0x61, 0x9d, 0x8e, 0xb3, 0xc0, 0x97, 0x4e,
- 0xa1, 0x9f, 0x9c, 0xdc, 0xaf, 0x3b, 0x5c, 0x31, 0xf0, 0xf2, 0x74,
- 0xc3, 0xbd, 0x6e, 0xd6, 0x1e, 0xb2, 0xbb, 0x34, 0x74, 0x72, 0x5c,
- 0x51, 0x29, 0x8b, 0x87, 0x3a, 0xa3, 0xf2, 0x25, 0x23, 0xd4, 0x1c,
- 0x82, 0x1b, 0xfe, 0xd3, 0xc6, 0xee, 0xb5, 0xd6, 0xaf, 0x07, 0x7b,
- 0x98, 0xca, 0xa7, 0x01, 0xf3, 0x94, 0xf3, 0x68,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_512)
- 0x14
-#else
- 0x00
-#endif
- };
- static const uint8_t kRSASignature[256] = {
- 0x62, 0x66, 0x4b, 0xe3, 0xb1, 0xd2, 0x83, 0xf1, 0xa8, 0x56, 0x2b, 0x33,
- 0x60, 0x1e, 0xdb, 0x1e, 0x06, 0xf7, 0xa7, 0x1e, 0xa8, 0xef, 0x03, 0x4d,
- 0x0c, 0xf6, 0x83, 0x75, 0x7a, 0xf0, 0x14, 0xc7, 0xe2, 0x94, 0x3a, 0xb5,
- 0x67, 0x56, 0xa5, 0x48, 0x7f, 0x3a, 0xa5, 0xbf, 0xf7, 0x1d, 0x44, 0xa6,
- 0x34, 0xed, 0x9b, 0xd6, 0x51, 0xaa, 0x2c, 0x4e, 0xce, 0x60, 0x5f, 0xe9,
- 0x0e, 0xd5, 0xcd, 0xeb, 0x23, 0x27, 0xf8, 0xfb, 0x45, 0xe5, 0x34, 0x63,
- 0x77, 0x7f, 0x2e, 0x80, 0xcf, 0x9d, 0x2e, 0xfc, 0xe2, 0x50, 0x75, 0x29,
- 0x46, 0xf4, 0xaf, 0x91, 0xed, 0x36, 0xe1, 0x5e, 0xef, 0x66, 0xa1, 0xff,
- 0x27, 0xfc, 0x87, 0x7e, 0x60, 0x84, 0x0f, 0x54, 0x51, 0x56, 0x0f, 0x68,
- 0x99, 0xc0, 0x3f, 0xeb, 0xa5, 0xa0, 0x46, 0xb0, 0x86, 0x02, 0xb0, 0xc8,
- 0xe8, 0x46, 0x13, 0x06, 0xcd, 0xb7, 0x8a, 0xd0, 0x3b, 0x46, 0xd0, 0x14,
- 0x64, 0x53, 0x9b, 0x5b, 0x5e, 0x02, 0x45, 0xba, 0x6e, 0x7e, 0x0a, 0xb9,
- 0x9e, 0x62, 0xb7, 0xd5, 0x7a, 0x87, 0xea, 0xd3, 0x24, 0xa5, 0xef, 0xb3,
- 0xdc, 0x05, 0x9c, 0x04, 0x60, 0x4b, 0xde, 0xa8, 0x90, 0x08, 0x7b, 0x6a,
- 0x5f, 0xb4, 0x3f, 0xda, 0xc5, 0x1f, 0x6e, 0xd6, 0x15, 0xde, 0x65, 0xa4,
- 0x6e, 0x62, 0x9d, 0x8f, 0xa8, 0xbe, 0x86, 0xf6, 0x09, 0x90, 0x40, 0xa5,
- 0xf4, 0x23, 0xc5, 0xf6, 0x38, 0x86, 0x0d, 0x1c, 0xed, 0x4a, 0x0a, 0xae,
- 0xa4, 0x26, 0xc2, 0x2e, 0xd3, 0x13, 0x66, 0x61, 0xea, 0x35, 0x01, 0x0e,
- 0x13, 0xda, 0x78, 0x20, 0xae, 0x59, 0x5f, 0x9b, 0xa9, 0x6c, 0xf9, 0x1b,
- 0xdf, 0x76, 0x53, 0xc8, 0xa7, 0xf5, 0x63, 0x6d, 0xf3, 0xff, 0xfd, 0xaf,
- 0x75, 0x4b, 0xac, 0x67, 0xb1, 0x3c, 0xbf, 0x5e, 0xde, 0x73, 0x02, 0x6d,
- 0xd2, 0x0c, 0xb1,
-#if !defined(BORINGSSL_FIPS_BREAK_RSA_SIG)
- 0x64
-#else
- 0x00
-#endif
- };
- const uint8_t kDRBGEntropy[48] =
- "BCM Known Answer Test DBRG Initial Entropy ";
- const uint8_t kDRBGPersonalization[18] = "BCMPersonalization";
- const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD ";
- const uint8_t kDRBGOutput[64] = {
- 0x1d, 0x63, 0xdf, 0x05, 0x51, 0x49, 0x22, 0x46, 0xcd, 0x9b, 0xc5,
- 0xbb, 0xf1, 0x5d, 0x44, 0xae, 0x13, 0x78, 0xb1, 0xe4, 0x7c, 0xf1,
- 0x96, 0x33, 0x3d, 0x60, 0xb6, 0x29, 0xd4, 0xbb, 0x6b, 0x44, 0xf9,
- 0xef, 0xd9, 0xf4, 0xa2, 0xba, 0x48, 0xea, 0x39, 0x75, 0x59, 0x32,
- 0xf7, 0x31, 0x2c, 0x98, 0x14, 0x2b, 0x49, 0xdf, 0x02, 0xb6, 0x5d,
- 0x71, 0x09, 0x50, 0xdb, 0x23, 0xdb, 0xe5, 0x22,
-#if !defined(BORINGSSL_FIPS_BREAK_DRBG)
- 0x95
-#else
- 0x00
-#endif
- };
- const uint8_t kDRBGEntropy2[48] =
- "BCM Known Answer Test DBRG Reseed Entropy ";
- const uint8_t kDRBGReseedOutput[64] = {
- 0xa4, 0x77, 0x05, 0xdb, 0x14, 0x11, 0x76, 0x71, 0x42, 0x5b, 0xd8,
- 0xd7, 0xa5, 0x4f, 0x8b, 0x39, 0xf2, 0x10, 0x4a, 0x50, 0x5b, 0xa2,
- 0xc8, 0xf0, 0xbb, 0x3e, 0xa1, 0xa5, 0x90, 0x7d, 0x54, 0xd9, 0xc6,
- 0xb0, 0x96, 0xc0, 0x2b, 0x7e, 0x9b, 0xc9, 0xa1, 0xdd, 0x78, 0x2e,
- 0xd5, 0xa8, 0x66, 0x16, 0xbd, 0x18, 0x3c, 0xf2, 0xaa, 0x7a, 0x2b,
- 0x37, 0xf9, 0xab, 0x35, 0x64, 0x15, 0x01, 0x3f, 0xc4,
- };
- const uint8_t kECDSASigR[32] = {
+ static const uint8_t kECDSASignSig[64] = {
0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0,
0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02,
- 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5,
-#if !defined(BORINGSSL_FIPS_BREAK_ECDSA_SIG)
- 0x0c,
-#else
- 0x00,
-#endif
+ 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x68,
+ 0x04, 0x73, 0x40, 0x94, 0xb2, 0xd1, 0x90, 0xac, 0x2d, 0x0c, 0xd7,
+ 0xa5, 0x7f, 0x2f, 0x2e, 0xb2, 0x62, 0xb0, 0x09, 0x16, 0xe1, 0xa6,
+ 0x70, 0xb5, 0xbb, 0x0d, 0xfd, 0x8e, 0x0c, 0x02, 0x3f,
};
- const uint8_t kECDSASigS[32] = {
- 0xa5, 0x93, 0xe0, 0x23, 0x91, 0xe7, 0x4b, 0x8d, 0x77, 0x25, 0xa6,
- 0xba, 0x4d, 0xd9, 0x86, 0x77, 0xda, 0x7d, 0x8f, 0xef, 0xc4, 0x1a,
- 0xf0, 0xcc, 0x81, 0xe5, 0xea, 0x3f, 0xc2, 0x41, 0x7f, 0xd8,
+
+ // The 'k' value for ECDSA is fixed to avoid an entropy draw.
+ uint8_t ecdsa_k[32] = {0};
+ ecdsa_k[31] = 42;
+
+ sig = ecdsa_sign_with_nonce_for_known_answer_test(
+ kECDSASignDigest, sizeof(kECDSASignDigest), ec_key, ecdsa_k,
+ sizeof(ecdsa_k));
+
+ uint8_t ecdsa_sign_output[64];
+ if (sig == NULL ||
+ !serialize_ecdsa_sig(ecdsa_sign_output, sizeof(ecdsa_sign_output), sig) ||
+ !check_test(kECDSASignSig, ecdsa_sign_output, sizeof(ecdsa_sign_output),
+ "ECDSA-sign signature")) {
+ fprintf(stderr, "ECDSA-sign KAT failed.\n");
+ goto err;
+ }
+
+ static const uint8_t kECDSAVerifyDigest[32] = {
+ 0x78, 0x7c, 0x50, 0x5c, 0x60, 0xc9, 0xe4, 0x13, 0x6c, 0xe4, 0x48,
+ 0xba, 0x93, 0xff, 0x71, 0xfa, 0x9c, 0x18, 0xf4, 0x17, 0x09, 0x4f,
+ 0xdf, 0x5a, 0xe2, 0x75, 0xc0, 0xcc, 0xd2, 0x67, 0x97, 0xad,
};
+ static const uint8_t kECDSAVerifySig[64] = {
+ 0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0,
+ 0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02,
+ 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x2d,
+ 0x36, 0xe5, 0x79, 0x97, 0x90, 0xbf, 0xbe, 0x21, 0x83, 0xd3, 0x3e,
+ 0x96, 0xf3, 0xc5, 0x1f, 0x6a, 0x23, 0x2f, 0x2a, 0x24, 0x48, 0x8c,
+ 0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29,
+ };
+
+ ECDSA_SIG_free(sig);
+ sig = parse_ecdsa_sig(kECDSAVerifySig, sizeof(kECDSAVerifySig));
+ if (!sig ||
+ !ecdsa_do_verify_no_self_test(kECDSAVerifyDigest,
+ sizeof(kECDSAVerifyDigest), sig, ec_key)) {
+ fprintf(stderr, "ECDSA-verify KAT failed.\n");
+ goto err;
+ }
+
+ // Primitive Z Computation KAT (IG 9.6).
+
// kP256Point is SHA256("Primitive Z Computation KAT")×G within P-256.
- const uint8_t kP256Point[65] = {
+ static const uint8_t kP256Point[65] = {
0x04, 0x4e, 0xc1, 0x94, 0x8c, 0x5c, 0xf4, 0x37, 0x35, 0x0d, 0xa3,
0xf9, 0x55, 0xf9, 0x8b, 0x26, 0x23, 0x5c, 0x43, 0xe0, 0x83, 0x51,
0x2b, 0x0d, 0x4b, 0x56, 0x24, 0xc3, 0xe4, 0xa5, 0xa8, 0xe2, 0xe9,
@@ -486,49 +484,63 @@
0x79, 0x93, 0x7c, 0x0b, 0x92, 0x2b, 0x7f, 0x17, 0xa5, 0x80,
};
// kP256Scalar is SHA256("Primitive Z Computation KAT scalar").
- const uint8_t kP256Scalar[32] = {
+ static const uint8_t kP256Scalar[32] = {
0xe7, 0x60, 0x44, 0x91, 0x26, 0x9a, 0xfb, 0x5b, 0x10, 0x2d, 0x6e,
0xa5, 0x2c, 0xb5, 0x9f, 0xeb, 0x70, 0xae, 0xde, 0x6c, 0xe3, 0xbf,
0xb3, 0xe0, 0x10, 0x54, 0x85, 0xab, 0xd8, 0x61, 0xd7, 0x7b,
};
// kP256PointResult is |kP256Scalar|×|kP256Point|.
- const uint8_t kP256PointResult[65] = {
+ static const uint8_t kP256PointResult[65] = {
0x04, 0xf1, 0x63, 0x00, 0x88, 0xc5, 0xd5, 0xe9, 0x05, 0x52, 0xac,
0xb6, 0xec, 0x68, 0x76, 0xb8, 0x73, 0x7f, 0x0f, 0x72, 0x34, 0xe6,
0xbb, 0x30, 0x32, 0x22, 0x37, 0xb6, 0x2a, 0x80, 0xe8, 0x9e, 0x6e,
0x6f, 0x36, 0x02, 0xe7, 0x21, 0xd2, 0x31, 0xdb, 0x94, 0x63, 0xb7,
0xd8, 0x19, 0x0e, 0xc2, 0xc0, 0xa7, 0x2f, 0x15, 0x49, 0x1a, 0xa2,
- 0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a,
-#if !defined(BORINGSSL_FIPS_BREAK_Z_COMPUTATION)
- 0x0c,
-#else
- 0x00,
-#endif
+ 0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a, 0x0c,
};
- const uint8_t kTLSOutput[32] = {
- 0x67, 0x85, 0xde, 0x60, 0xfc, 0x0a, 0x83, 0xe9, 0xa2, 0x2a, 0xb3,
- 0xf0, 0x27, 0x0c, 0xba, 0xf7, 0xfa, 0x82, 0x3d, 0x14, 0x77, 0x1d,
- 0x86, 0x29, 0x79, 0x39, 0x77, 0x8a, 0xd5, 0x0e, 0x9d,
-#if !defined(BORINGSSL_FIPS_BREAK_TLS_KDF)
- 0x32,
-#else
- 0x00,
-#endif
- };
- const uint8_t kTLSSecret[32] = {
- 0xbf, 0xe4, 0xb7, 0xe0, 0x26, 0x55, 0x5f, 0x6a, 0xdf, 0x5d, 0x27,
- 0xd6, 0x89, 0x99, 0x2a, 0xd6, 0xf7, 0x65, 0x66, 0x07, 0x4b, 0x55,
- 0x5f, 0x64, 0x55, 0xcd, 0xd5, 0x77, 0xa4, 0xc7, 0x09, 0x61,
- };
- const char kTLSLabel[] = "FIPS self test";
- const uint8_t kTLSSeed1[16] = {
- 0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2,
- 0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65,
- };
- const uint8_t kTLSSeed2[16] = {
- 0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c,
- 0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81,
- };
+
+ ec_group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+ if (ec_group == NULL) {
+ fprintf(stderr, "Failed to create P-256 group.\n");
+ goto err;
+ }
+ ec_point_in = EC_POINT_new(ec_group);
+ ec_point_out = EC_POINT_new(ec_group);
+ ec_scalar = BN_new();
+ uint8_t z_comp_result[65];
+ if (ec_point_in == NULL || ec_point_out == NULL || ec_scalar == NULL ||
+ !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point),
+ NULL) ||
+ !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) ||
+ !ec_point_mul_no_self_test(ec_group, ec_point_out, NULL, ec_point_in,
+ ec_scalar, NULL) ||
+ !EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED,
+ z_comp_result, sizeof(z_comp_result), NULL) ||
+ !check_test(kP256PointResult, z_comp_result, sizeof(z_comp_result),
+ "Z Computation Result")) {
+ fprintf(stderr, "Z-computation KAT failed.\n");
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ EC_KEY_free(ec_key);
+ EC_POINT_free(ec_point_in);
+ EC_POINT_free(ec_point_out);
+ EC_GROUP_free(ec_group);
+ BN_free(ec_scalar);
+ ECDSA_SIG_free(sig);
+
+ return ret;
+}
+
+static int boringssl_self_test_ffdh(void) {
+ int ret = 0;
+ DH *dh = NULL;
+ BIGNUM *ffdhe2048_value = NULL;
+
+ // FFC Diffie-Hellman KAT
// kFFDHE2048PublicValueData is an arbitrary public value, mod
// kFFDHE2048Data. (The private key happens to be 4096.)
@@ -550,8 +562,7 @@
TOBN(0xbae7b0b3, 0x6e362dc0), TOBN(0xa57c73bd, 0xdc70fb82),
TOBN(0xfaff50d2, 0x9d573457), TOBN(0x352bd399, 0xbe84058e),
};
-
- const uint8_t kDHOutput[2048 / 8] = {
+ static const uint8_t kDHOutput[2048 / 8] = {
0x2a, 0xe6, 0xd3, 0xa6, 0x13, 0x58, 0x8e, 0xce, 0x53, 0xaa, 0xf6, 0x5d,
0x9a, 0xae, 0x02, 0x12, 0xf5, 0x80, 0x3d, 0x06, 0x09, 0x76, 0xac, 0x57,
0x37, 0x9e, 0xab, 0x38, 0x62, 0x25, 0x05, 0x1d, 0xf3, 0xa9, 0x39, 0x60,
@@ -573,23 +584,144 @@
0x06, 0x80, 0x2a, 0x4e, 0x5a, 0xf0, 0x1e, 0xaa, 0xcb, 0xab, 0x06, 0x0e,
0x27, 0x0f, 0xd9, 0x88, 0xd9, 0x01, 0xe3, 0x07, 0xeb, 0xdf, 0xc3, 0x12,
0xe3, 0x40, 0x88, 0x7b, 0x5f, 0x59, 0x78, 0x6e, 0x26, 0x20, 0xc3, 0xdf,
- 0xc8, 0xe4, 0x5e,
-#if !defined(BORINGSSL_FIPS_BREAK_FFC_DH)
- 0xb8,
-#else
- 0x00,
-#endif
+ 0xc8, 0xe4, 0x5e, 0xb8,
};
+ ffdhe2048_value = BN_new();
+ if (ffdhe2048_value) {
+ bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData,
+ OPENSSL_ARRAY_SIZE(kFFDHE2048PublicValueData));
+ }
+
+ dh = self_test_dh();
+ uint8_t dh_out[sizeof(kDHOutput)];
+ if (dh == NULL || ffdhe2048_value == NULL || sizeof(dh_out) != DH_size(dh) ||
+ dh_compute_key_padded_no_self_test(dh_out, ffdhe2048_value, dh) !=
+ sizeof(dh_out) ||
+ !check_test(kDHOutput, dh_out, sizeof(dh_out), "FFC DH")) {
+ fprintf(stderr, "FFDH failed.\n");
+ goto err;
+ }
+
+ ret = 1;
+
+err:
+ DH_free(dh);
+ BN_free(ffdhe2048_value);
+
+ return ret;
+}
+
+#if defined(BORINGSSL_FIPS)
+
+static void run_self_test_rsa(void) {
+ if (!boringssl_self_test_rsa()) {
+ BORINGSSL_FIPS_abort();
+ }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_rsa);
+
+void boringssl_ensure_rsa_self_test(void) {
+ CRYPTO_once(g_self_test_once_rsa_bss_get(), run_self_test_rsa);
+}
+
+static void run_self_test_ecc(void) {
+ if (!boringssl_self_test_ecc()) {
+ BORINGSSL_FIPS_abort();
+ }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_ecc);
+
+void boringssl_ensure_ecc_self_test(void) {
+ CRYPTO_once(g_self_test_once_ecc_bss_get(), run_self_test_ecc);
+}
+
+static void run_self_test_ffdh(void) {
+ if (!boringssl_self_test_ffdh()) {
+ BORINGSSL_FIPS_abort();
+ }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_ffdh);
+
+void boringssl_ensure_ffdh_self_test(void) {
+ CRYPTO_once(g_self_test_once_ffdh_bss_get(), run_self_test_ffdh);
+}
+
+#endif // BORINGSSL_FIPS
+
+
+// Startup self tests.
+//
+// These tests are run at process start when in FIPS mode.
+
+int boringssl_self_test_sha256(void) {
+ static const uint8_t kInput[16] = {
+ 0xff, 0x3b, 0x85, 0x7d, 0xa7, 0x23, 0x6a, 0x2b,
+ 0xaa, 0x0f, 0x39, 0x6b, 0x51, 0x52, 0x22, 0x17,
+ };
+ static const uint8_t kPlaintextSHA256[32] = {
+ 0x7f, 0xe4, 0xd5, 0xf1, 0xa1, 0xe3, 0x82, 0x87, 0xd9, 0x58, 0xf5,
+ 0x11, 0xc7, 0x1d, 0x5e, 0x27, 0x5e, 0xcc, 0xd2, 0x66, 0xcf, 0xb9,
+ 0xc8, 0xc6, 0x60, 0xd8, 0x92, 0x1e, 0x57, 0xfd, 0x46, 0x75,
+ };
+ uint8_t output[SHA256_DIGEST_LENGTH];
+
+ // SHA-256 KAT
+ SHA256(kInput, sizeof(kInput), output);
+ return check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256),
+ "SHA-256 KAT");
+}
+
+int boringssl_self_test_sha512(void) {
+ static const uint8_t kInput[16] = {
+ 0x21, 0x25, 0x12, 0xf8, 0xd2, 0xad, 0x83, 0x22,
+ 0x78, 0x1c, 0x6c, 0x4d, 0x69, 0xa9, 0xda, 0xa1,
+ };
+ static const uint8_t kPlaintextSHA512[64] = {
+ 0x29, 0x3c, 0x94, 0x35, 0x4e, 0x98, 0x83, 0xe5, 0xc2, 0x78, 0x36,
+ 0x7a, 0xe5, 0x18, 0x90, 0xbf, 0x35, 0x41, 0x01, 0x64, 0x19, 0x8d,
+ 0x26, 0xeb, 0xe1, 0xf8, 0x2f, 0x04, 0x8e, 0xfa, 0x8b, 0x2b, 0xc6,
+ 0xb2, 0x9d, 0x5d, 0x46, 0x76, 0x5a, 0xc8, 0xb5, 0x25, 0xa3, 0xea,
+ 0x52, 0x84, 0x47, 0x6d, 0x6d, 0xf4, 0xc9, 0x71, 0xf3, 0x3d, 0x89,
+ 0x4c, 0x3b, 0x20, 0x8c, 0x5b, 0x75, 0xe8, 0xf8, 0x7c,
+ };
+ uint8_t output[SHA512_DIGEST_LENGTH];
+
+ // SHA-512 KAT
+ SHA512(kInput, sizeof(kInput), output);
+ return check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512),
+ "SHA-512 KAT");
+}
+
+int boringssl_self_test_hmac_sha256(void) {
+ static const uint8_t kInput[16] = {
+ 0xda, 0xd9, 0x12, 0x93, 0xdf, 0xcf, 0x2a, 0x7c,
+ 0x8e, 0xcd, 0x13, 0xfe, 0x35, 0x3f, 0xa7, 0x5b,
+ };
+ static const uint8_t kPlaintextHMACSHA256[32] = {
+ 0x36, 0x5f, 0x5b, 0xd5, 0xf5, 0xeb, 0xfd, 0xc7, 0x6e, 0x53, 0xa5,
+ 0x73, 0x6d, 0x73, 0x20, 0x13, 0xaa, 0xd3, 0xbc, 0x86, 0x4b, 0xb8,
+ 0x84, 0x94, 0x16, 0x46, 0x88, 0x9c, 0x48, 0xee, 0xa9, 0x0e,
+ };
+ uint8_t output[EVP_MAX_MD_SIZE];
+
+ unsigned output_len;
+ HMAC(EVP_sha256(), kInput, sizeof(kInput), kInput, sizeof(kInput), output,
+ &output_len);
+ return output_len == sizeof(kPlaintextHMACSHA256) &&
+ check_test(kPlaintextHMACSHA256, output, sizeof(kPlaintextHMACSHA256),
+ "HMAC-SHA-256 KAT");
+}
+
+static int boringssl_self_test_fast(void) {
+ static const uint8_t kAESKey[16] = "BoringCrypto Key";
+ static const uint8_t kAESIV[16] = {0};
+
EVP_AEAD_CTX aead_ctx;
EVP_AEAD_CTX_zero(&aead_ctx);
- RSA *rsa_key = NULL;
- EC_KEY *ec_key = NULL;
- EC_GROUP *ec_group = NULL;
- EC_POINT *ec_point_in = NULL;
- EC_POINT *ec_point_out = NULL;
- BIGNUM *ec_scalar = NULL;
- ECDSA_SIG *sig = NULL;
int ret = 0;
AES_KEY aes_key;
@@ -597,28 +729,48 @@
uint8_t output[256];
// AES-CBC Encryption KAT
+ static const uint8_t kAESCBCEncPlaintext[32] = {
+ 0x07, 0x86, 0x09, 0xa6, 0xc5, 0xac, 0x25, 0x44, 0x69, 0x9a, 0xdf,
+ 0x68, 0x2f, 0xa3, 0x77, 0xf9, 0xbe, 0x8a, 0xb6, 0xae, 0xf5, 0x63,
+ 0xe8, 0xc5, 0x6a, 0x36, 0xb8, 0x4f, 0x55, 0x7f, 0xad, 0xd3,
+ };
+ static const uint8_t kAESCBCEncCiphertext[sizeof(kAESCBCEncPlaintext)] = {
+ 0x56, 0x46, 0xc1, 0x41, 0xf4, 0x13, 0xd6, 0xff, 0x62, 0x92, 0x41,
+ 0x7a, 0x26, 0xc6, 0x86, 0xbd, 0x30, 0x5f, 0xb6, 0x57, 0xa7, 0xd2,
+ 0x50, 0x3a, 0xc5, 0x5e, 0x8e, 0x93, 0x40, 0xf2, 0x10, 0xd8,
+ };
memcpy(aes_iv, kAESIV, sizeof(kAESIV));
if (AES_set_encrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
fprintf(stderr, "AES_set_encrypt_key failed.\n");
goto err;
}
- AES_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &aes_key, aes_iv,
- AES_ENCRYPT);
- if (!check_test(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
- "AES-CBC Encryption KAT")) {
+ AES_cbc_encrypt(kAESCBCEncPlaintext, output, sizeof(kAESCBCEncPlaintext),
+ &aes_key, aes_iv, AES_ENCRYPT);
+ if (!check_test(kAESCBCEncCiphertext, output, sizeof(kAESCBCEncCiphertext),
+ "AES-CBC-encrypt KAT")) {
goto err;
}
// AES-CBC Decryption KAT
+ static const uint8_t kAESCBCDecCiphertext[32] = {
+ 0x34, 0x7a, 0xa5, 0xa0, 0x24, 0xb2, 0x82, 0x57, 0xb3, 0x65, 0x10,
+ 0xbe, 0x58, 0x3d, 0x4f, 0x47, 0xad, 0xb7, 0xbb, 0xee, 0xdc, 0x60,
+ 0x05, 0xbb, 0xbd, 0x0d, 0x0a, 0x9f, 0x06, 0xbb, 0x7b, 0x10,
+ };
+ static const uint8_t kAESCBCDecPlaintext[sizeof(kAESCBCDecCiphertext)] = {
+ 0x51, 0xa7, 0xa0, 0x1f, 0x6b, 0x79, 0x6c, 0xcd, 0x48, 0x03, 0xa1,
+ 0x41, 0xdc, 0x56, 0xa6, 0xc2, 0x16, 0xb5, 0xd1, 0xd3, 0xb7, 0x06,
+ 0xb2, 0x25, 0x6f, 0xa6, 0xd0, 0xd2, 0x0e, 0x6f, 0x19, 0xb5,
+ };
memcpy(aes_iv, kAESIV, sizeof(kAESIV));
if (AES_set_decrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
fprintf(stderr, "AES_set_decrypt_key failed.\n");
goto err;
}
- AES_cbc_encrypt(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
+ AES_cbc_encrypt(kAESCBCDecCiphertext, output, sizeof(kAESCBCDecCiphertext),
&aes_key, aes_iv, AES_DECRYPT);
- if (!check_test(kPlaintext, output, sizeof(kPlaintext),
- "AES-CBC Decryption KAT")) {
+ if (!check_test(kAESCBCDecPlaintext, output, sizeof(kAESCBCDecPlaintext),
+ "AES-CBC-decrypt KAT")) {
goto err;
}
@@ -632,194 +784,115 @@
}
// AES-GCM Encryption KAT
+ static const uint8_t kAESGCMEncPlaintext[32] = {
+ 0x8f, 0xcc, 0x40, 0x99, 0x80, 0x8e, 0x75, 0xca, 0xaf, 0xf5, 0x82,
+ 0x89, 0x88, 0x48, 0xa8, 0x8d, 0x80, 0x8b, 0x55, 0xab, 0x4e, 0x93,
+ 0x70, 0x79, 0x7d, 0x94, 0x0b, 0xe8, 0xcc, 0x1d, 0x78, 0x84,
+ };
+ static const uint8_t kAESGCMCiphertext[sizeof(kAESGCMEncPlaintext) + 16] = {
+ 0x87, 0x7b, 0xd5, 0x8d, 0x96, 0x3e, 0x4b, 0xe6, 0x64, 0x94, 0x40, 0x2f,
+ 0x61, 0x9b, 0x7e, 0x56, 0x52, 0x7d, 0xa4, 0x5a, 0xf9, 0xa6, 0xe2, 0xdb,
+ 0x1c, 0x63, 0x2e, 0x97, 0x93, 0x0f, 0xfb, 0xed, 0xb5, 0x9e, 0x1c, 0x20,
+ 0xb2, 0xb0, 0x58, 0xda, 0x48, 0x07, 0x2d, 0xbd, 0x96, 0x0d, 0x34, 0xc6,
+ };
if (!EVP_AEAD_CTX_seal(&aead_ctx, output, &out_len, sizeof(output), nonce,
EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
- kPlaintext, sizeof(kPlaintext), NULL, 0) ||
+ kAESGCMEncPlaintext, sizeof(kAESGCMEncPlaintext), NULL,
+ 0) ||
!check_test(kAESGCMCiphertext, output, sizeof(kAESGCMCiphertext),
- "AES-GCM Encryption KAT")) {
+ "AES-GCM-encrypt KAT")) {
fprintf(stderr, "EVP_AEAD_CTX_seal for AES-128-GCM failed.\n");
goto err;
}
// AES-GCM Decryption KAT
+ static const uint8_t kAESGCMDecCiphertext[48] = {
+ 0x35, 0xf3, 0x05, 0x8f, 0x87, 0x57, 0x60, 0xff, 0x09, 0xd3, 0x12, 0x0f,
+ 0x70, 0xc4, 0xbc, 0x9e, 0xd7, 0xa8, 0x68, 0x72, 0xe1, 0x34, 0x52, 0x20,
+ 0x21, 0x76, 0xf7, 0x37, 0x1a, 0xe0, 0x4f, 0xaa, 0xe1, 0xdd, 0x39, 0x19,
+ 0x20, 0xf5, 0xd1, 0x39, 0x53, 0xd8, 0x96, 0x78, 0x59, 0x94, 0x82, 0x3c,
+ };
+ static const uint8_t kAESGCMDecPlaintext[sizeof(kAESGCMDecCiphertext) - 16] =
+ {
+ 0x3d, 0x44, 0x90, 0x9b, 0x91, 0xe7, 0x5e, 0xd3, 0xc2, 0xb2, 0xd0,
+ 0xa9, 0x99, 0x17, 0x6a, 0x45, 0x05, 0x5e, 0x99, 0x83, 0x56, 0x01,
+ 0xc0, 0x82, 0x40, 0x81, 0xd2, 0x48, 0x45, 0xf2, 0xcc, 0xc3,
+ };
if (!EVP_AEAD_CTX_open(&aead_ctx, output, &out_len, sizeof(output), nonce,
EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
- kAESGCMCiphertext, sizeof(kAESGCMCiphertext), NULL,
- 0) ||
- !check_test(kPlaintext, output, sizeof(kPlaintext),
- "AES-GCM Decryption KAT")) {
- fprintf(stderr, "EVP_AEAD_CTX_open for AES-128-GCM failed.\n");
- goto err;
- }
-
- DES_key_schedule des1, des2, des3;
- DES_cblock des_iv;
- DES_set_key(&kDESKey1, &des1);
- DES_set_key(&kDESKey2, &des2);
- DES_set_key(&kDESKey3, &des3);
-
- // 3DES Encryption KAT
- memcpy(&des_iv, &kDESIV, sizeof(des_iv));
- DES_ede3_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &des1, &des2,
- &des3, &des_iv, DES_ENCRYPT);
- if (!check_test(kDESCiphertext, output, sizeof(kDESCiphertext),
- "3DES Encryption KAT")) {
- goto err;
- }
-
- // 3DES Decryption KAT
- memcpy(&des_iv, &kDESIV, sizeof(des_iv));
- DES_ede3_cbc_encrypt(kDESCiphertext, output, sizeof(kDESCiphertext), &des1,
- &des2, &des3, &des_iv, DES_DECRYPT);
- if (!check_test(kPlaintext, output, sizeof(kPlaintext),
- "3DES Decryption KAT")) {
+ kAESGCMDecCiphertext, sizeof(kAESGCMDecCiphertext),
+ NULL, 0) ||
+ !check_test(kAESGCMDecPlaintext, output, sizeof(kAESGCMDecPlaintext),
+ "AES-GCM-decrypt KAT")) {
+ fprintf(stderr,
+ "AES-GCM-decrypt KAT failed because EVP_AEAD_CTX_open failed.\n");
goto err;
}
// SHA-1 KAT
- SHA1(kPlaintext, sizeof(kPlaintext), output);
- if (!check_test(kPlaintextSHA1, output, sizeof(kPlaintextSHA1),
+ static const uint8_t kSHA1Input[16] = {
+ 0x13, 0x2f, 0xd9, 0xba, 0xd5, 0xc1, 0x82, 0x62,
+ 0x63, 0xba, 0xfb, 0xb6, 0x99, 0xf7, 0x07, 0xa5,
+ };
+ static const uint8_t kSHA1Digest[20] = {
+ 0x94, 0x19, 0x55, 0x93, 0x0a, 0x58, 0x29, 0x38, 0xeb, 0xf5,
+ 0x09, 0x11, 0x6d, 0x1a, 0xfd, 0x0f, 0x1e, 0x11, 0xe3, 0xcb,
+ };
+ SHA1(kSHA1Input, sizeof(kSHA1Input), output);
+ if (!check_test(kSHA1Digest, output, sizeof(kSHA1Digest),
"SHA-1 KAT")) {
goto err;
}
- // SHA-256 KAT
- SHA256(kPlaintext, sizeof(kPlaintext), output);
- if (!check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256),
- "SHA-256 KAT")) {
- goto err;
- }
-
- // SHA-512 KAT
- SHA512(kPlaintext, sizeof(kPlaintext), output);
- if (!check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512),
- "SHA-512 KAT")) {
- goto err;
- }
-
- rsa_key = self_test_rsa_key();
- if (rsa_key == NULL) {
- fprintf(stderr, "RSA KeyGen failed\n");
- goto err;
- }
-
- // RSA Sign KAT
- unsigned sig_len;
-
- // Disable blinding for the power-on tests because it's not needed and
- // triggers an entropy draw.
- rsa_key->flags |= RSA_FLAG_NO_BLINDING;
-
- if (!RSA_sign(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256), output,
- &sig_len, rsa_key) ||
- !check_test(kRSASignature, output, sizeof(kRSASignature),
- "RSA Sign KAT")) {
- fprintf(stderr, "RSA signing test failed.\n");
- goto err;
- }
-
- // RSA Verify KAT
- if (!RSA_verify(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256),
- kRSASignature, sizeof(kRSASignature), rsa_key)) {
- fprintf(stderr, "RSA Verify KAT failed.\n");
- goto err;
- }
-
- ec_key = self_test_ecdsa_key();
- if (ec_key == NULL) {
- fprintf(stderr, "ECDSA KeyGen failed\n");
- goto err;
- }
-
- // ECDSA Sign/Verify KAT
-
- // The 'k' value for ECDSA is fixed to avoid an entropy draw.
- uint8_t ecdsa_k[32] = {0};
- ecdsa_k[31] = 42;
-
- sig = ecdsa_sign_with_nonce_for_known_answer_test(
- kPlaintextSHA256, sizeof(kPlaintextSHA256), ec_key, ecdsa_k,
- sizeof(ecdsa_k));
-
- uint8_t ecdsa_r_bytes[sizeof(kECDSASigR)];
- uint8_t ecdsa_s_bytes[sizeof(kECDSASigS)];
- if (sig == NULL ||
- BN_num_bytes(sig->r) != sizeof(ecdsa_r_bytes) ||
- !BN_bn2bin(sig->r, ecdsa_r_bytes) ||
- BN_num_bytes(sig->s) != sizeof(ecdsa_s_bytes) ||
- !BN_bn2bin(sig->s, ecdsa_s_bytes) ||
- !check_test(kECDSASigR, ecdsa_r_bytes, sizeof(kECDSASigR), "ECDSA R") ||
- !check_test(kECDSASigS, ecdsa_s_bytes, sizeof(kECDSASigS), "ECDSA S")) {
- fprintf(stderr, "ECDSA signature KAT failed.\n");
- goto err;
- }
-
- if (!ECDSA_do_verify(kPlaintextSHA256, sizeof(kPlaintextSHA256), sig,
- ec_key)) {
- fprintf(stderr, "ECDSA verification KAT failed.\n");
- goto err;
- }
-
- // Primitive Z Computation KAT (IG 9.6).
- ec_group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
- if (ec_group == NULL) {
- fprintf(stderr, "Failed to create P-256 group.\n");
- goto err;
- }
- ec_point_in = EC_POINT_new(ec_group);
- ec_point_out = EC_POINT_new(ec_group);
- ec_scalar = BN_new();
- uint8_t z_comp_result[65];
- if (ec_point_in == NULL || ec_point_out == NULL || ec_scalar == NULL ||
- !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point),
- NULL) ||
- !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) ||
- !EC_POINT_mul(ec_group, ec_point_out, NULL, ec_point_in, ec_scalar,
- NULL) ||
- !EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED,
- z_comp_result, sizeof(z_comp_result), NULL) ||
- !check_test(kP256PointResult, z_comp_result, sizeof(z_comp_result),
- "Z Computation Result")) {
- fprintf(stderr, "Z Computation KAT failed.\n");
- goto err;
- }
-
- // FFC Diffie-Hellman KAT
-
- BIGNUM *const ffdhe2048_value = BN_new();
- DH *const dh = self_test_dh();
- int dh_ok = 0;
- if (ffdhe2048_value && dh) {
- bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData,
- OPENSSL_ARRAY_SIZE(kFFDHE2048PublicValueData));
-
- uint8_t dh_out[sizeof(kDHOutput)];
- dh_ok =
- sizeof(dh_out) == DH_size(dh) &&
- DH_compute_key_padded(dh_out, ffdhe2048_value, dh) == sizeof(dh_out) &&
- check_test(kDHOutput, dh_out, sizeof(dh_out), "FFC DH");
- }
-
- BN_free(ffdhe2048_value);
- DH_free(dh);
- if (!dh_ok) {
- fprintf(stderr, "FFDH failed.\n");
+ if (!boringssl_self_test_sha256() ||
+ !boringssl_self_test_sha512() ||
+ !boringssl_self_test_hmac_sha256()) {
goto err;
}
// DBRG KAT
+ static const uint8_t kDRBGEntropy[48] = {
+ 0xc4, 0xda, 0x07, 0x40, 0xd5, 0x05, 0xf1, 0xee, 0x28, 0x0b, 0x95, 0xe5,
+ 0x8c, 0x49, 0x31, 0xac, 0x6d, 0xe8, 0x46, 0xa0, 0x15, 0x2f, 0xbb, 0x4a,
+ 0x3f, 0x17, 0x4c, 0xf4, 0x78, 0x7a, 0x4f, 0x1a, 0x40, 0xc2, 0xb5, 0x0b,
+ 0xab, 0xe1, 0x4a, 0xae, 0x53, 0x0b, 0xe5, 0x88, 0x6d, 0x91, 0x0a, 0x27,
+ };
+ static const uint8_t kDRBGPersonalization[18] = "BCMPersonalization";
+ static const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD ";
+ static const uint8_t kDRBGOutput[64] = {
+ 0x19, 0x1f, 0x2b, 0x49, 0x76, 0x85, 0xfd, 0x51, 0xb6, 0x56, 0xbc,
+ 0x1c, 0x7d, 0xd5, 0xdd, 0x44, 0x76, 0xa3, 0x5e, 0x17, 0x9b, 0x8e,
+ 0xb8, 0x98, 0x65, 0x12, 0xca, 0x35, 0x6c, 0xa0, 0x6f, 0xa0, 0x22,
+ 0xe4, 0xf6, 0xd8, 0x43, 0xed, 0x4e, 0x2d, 0x97, 0x39, 0x43, 0x3b,
+ 0x57, 0xfc, 0x23, 0x3f, 0x71, 0x0a, 0xe0, 0xed, 0xfe, 0xd5, 0xb8,
+ 0x67, 0x7a, 0x00, 0x39, 0xb2, 0x6e, 0xa9, 0x25, 0x97,
+ };
+ static const uint8_t kDRBGEntropy2[48] = {
+ 0xc7, 0x16, 0x1c, 0xa3, 0x6c, 0x23, 0x09, 0xb7, 0x16, 0xe9, 0x85, 0x9b,
+ 0xb9, 0x6c, 0x6d, 0x49, 0xbd, 0xc8, 0x35, 0x21, 0x03, 0xa1, 0x8c, 0xd2,
+ 0x4e, 0xf4, 0x2e, 0xc9, 0x7e, 0xf4, 0x6b, 0xf4, 0x46, 0xeb, 0x1a, 0x45,
+ 0x76, 0xc1, 0x86, 0xe9, 0x35, 0x18, 0x03, 0x76, 0x3a, 0x79, 0x12, 0xfe,
+ };
+ static const uint8_t kDRBGReseedOutput[64] = {
+ 0x00, 0xf2, 0x05, 0xaa, 0xfd, 0x11, 0x6c, 0x77, 0xbc, 0x81, 0x86,
+ 0x99, 0xca, 0x51, 0xcf, 0x80, 0x15, 0x9f, 0x02, 0x9e, 0x0b, 0xcd,
+ 0x26, 0xc8, 0x4b, 0x87, 0x8a, 0x15, 0x1a, 0xdd, 0xf2, 0xf3, 0xeb,
+ 0x94, 0x0b, 0x08, 0xc8, 0xc9, 0x57, 0xa4, 0x0b, 0x4b, 0x0f, 0x13,
+ 0xde, 0x7c, 0x0c, 0x6a, 0xac, 0x34, 0x4a, 0x9a, 0xf2, 0xd0, 0x83,
+ 0x02, 0x05, 0x17, 0xc9, 0x81, 0x8f, 0x2a, 0x81, 0x92,
+ };
CTR_DRBG_STATE drbg;
if (!CTR_DRBG_init(&drbg, kDRBGEntropy, kDRBGPersonalization,
sizeof(kDRBGPersonalization)) ||
!CTR_DRBG_generate(&drbg, output, sizeof(kDRBGOutput), kDRBGAD,
sizeof(kDRBGAD)) ||
!check_test(kDRBGOutput, output, sizeof(kDRBGOutput),
- "DBRG Generate KAT") ||
+ "DRBG Generate KAT") ||
!CTR_DRBG_reseed(&drbg, kDRBGEntropy2, kDRBGAD, sizeof(kDRBGAD)) ||
!CTR_DRBG_generate(&drbg, output, sizeof(kDRBGReseedOutput), kDRBGAD,
sizeof(kDRBGAD)) ||
!check_test(kDRBGReseedOutput, output, sizeof(kDRBGReseedOutput),
- "DRBG Reseed KAT")) {
+ "DRBG-reseed KAT")) {
fprintf(stderr, "CTR-DRBG failed.\n");
goto err;
}
@@ -832,43 +905,59 @@
}
// TLS KDF KAT
+ static const uint8_t kTLSSecret[32] = {
+ 0xab, 0xc3, 0x65, 0x7b, 0x09, 0x4c, 0x76, 0x28, 0xa0, 0xb2, 0x82,
+ 0x99, 0x6f, 0xe7, 0x5a, 0x75, 0xf4, 0x98, 0x4f, 0xd9, 0x4d, 0x4e,
+ 0xcc, 0x2f, 0xcf, 0x53, 0xa2, 0xc4, 0x69, 0xa3, 0xf7, 0x31,
+ };
+ static const char kTLSLabel[] = "FIPS self test";
+ static const uint8_t kTLSSeed1[16] = {
+ 0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2,
+ 0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65,
+ };
+ static const uint8_t kTLSSeed2[16] = {
+ 0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c,
+ 0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81,
+ };
+ static const uint8_t kTLSOutput[32] = {
+ 0xe2, 0x1d, 0xd6, 0xc2, 0x68, 0xc7, 0x57, 0x03, 0x2c, 0x2c, 0xeb,
+ 0xbb, 0xb8, 0xa9, 0x7d, 0xe9, 0xee, 0xe6, 0xc9, 0x47, 0x83, 0x0a,
+ 0xbd, 0x11, 0x60, 0x5d, 0xd5, 0x2c, 0x47, 0xb6, 0x05, 0x88,
+ };
uint8_t tls_output[sizeof(kTLSOutput)];
if (!CRYPTO_tls1_prf(EVP_sha256(), tls_output, sizeof(tls_output), kTLSSecret,
sizeof(kTLSSecret), kTLSLabel, sizeof(kTLSLabel),
kTLSSeed1, sizeof(kTLSSeed1), kTLSSeed2,
sizeof(kTLSSeed2)) ||
- !check_test(kTLSOutput, tls_output, sizeof(kTLSOutput), "TLS KDF KAT")) {
+ !check_test(kTLSOutput, tls_output, sizeof(kTLSOutput), "TLS-KDF KAT")) {
fprintf(stderr, "TLS KDF failed.\n");
goto err;
}
ret = 1;
-#if defined(BORINGSSL_FIPS_SELF_TEST_FLAG_FILE)
- // Tests were successful. Write flag file if requested.
- if (module_hash_len != 0 && getenv(kFlagWriteEnableEnvVar) != NULL) {
- const int fd = open(flag_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
- if (fd >= 0) {
- close(fd);
- }
- }
-#endif // BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
-
err:
EVP_AEAD_CTX_cleanup(&aead_ctx);
- RSA_free(rsa_key);
- EC_KEY_free(ec_key);
- EC_POINT_free(ec_point_in);
- EC_POINT_free(ec_point_out);
- EC_GROUP_free(ec_group);
- BN_free(ec_scalar);
- ECDSA_SIG_free(sig);
return ret;
}
int BORINGSSL_self_test(void) {
- return boringssl_fips_self_test(NULL, 0);
+ if (!boringssl_self_test_fast() ||
+ // When requested to run self tests, also run the lazy tests.
+ !boringssl_self_test_rsa() ||
+ !boringssl_self_test_ecc() ||
+ !boringssl_self_test_ffdh()) {
+ return 0;
+ }
+
+ return 1;
}
+#if defined(BORINGSSL_FIPS)
+int boringssl_self_test_startup(void) {
+ return boringssl_self_test_fast();
+}
+#endif
+
#endif // !_MSC_VER
diff --git a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 61f67cb..2abd065 100755
--- a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -126,15 +126,12 @@
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
-# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
-# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
-# did not tie them together until after $shaext was added.
+# This file also has an AVX2 implementation, controlled by setting $avx to 2.
+# For now, we intentionally disable it. While it gives a 13-16% perf boost, the
+# CFI annotations are wrong. It allocates stack in a loop and should be
+# rewritten to avoid this.
$avx = 1;
-
-# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
-# been tested.
-$shaext=0; ### set to zero if compiling for 1.0.1
-$avx=1 if (!$shaext && $avx);
+$shaext = 1;
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
@@ -275,7 +272,7 @@
___
$code.=<<___ if ($SZ==4 && $shaext);
test \$`1<<29`,%r11d # check for SHA
- jnz _shaext_shortcut
+ jnz .Lshaext_shortcut
___
# XOP codepath removed.
$code.=<<___ if ($avx>1);
@@ -559,7 +556,8 @@
.type sha256_block_data_order_shaext,\@function,3
.align 64
sha256_block_data_order_shaext:
-_shaext_shortcut:
+.cfi_startproc
+.Lshaext_shortcut:
___
$code.=<<___ if ($win64);
lea `-8-5*16`(%rsp),%rsp
@@ -703,6 +701,7 @@
___
$code.=<<___;
ret
+.cfi_endproc
.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
}}}
diff --git a/src/crypto/hpke/hpke.c b/src/crypto/hpke/hpke.c
index c71ac2a..827ffaa 100644
--- a/src/crypto/hpke/hpke.c
+++ b/src/crypto/hpke/hpke.c
@@ -30,7 +30,7 @@
#include "../internal.h"
-// This file implements draft-irtf-cfrg-hpke-12.
+// This file implements RFC 9180.
#define MAX_SEED_LEN X25519_PRIVATE_KEY_LEN
#define MAX_SHARED_SECRET_LEN SHA256_DIGEST_LENGTH
@@ -115,7 +115,7 @@
// KEM implementations.
// dhkem_extract_and_expand implements the ExtractAndExpand operation in the
-// DHKEM construction. See section 4.1 of draft-irtf-cfrg-hpke-12.
+// DHKEM construction. See section 4.1 of RFC 9180.
static int dhkem_extract_and_expand(uint16_t kem_id, const EVP_MD *hkdf_md,
uint8_t *out_key, size_t out_len,
const uint8_t *dh, size_t dh_len,
diff --git a/src/crypto/hpke/translate_test_vectors.py b/src/crypto/hpke/translate_test_vectors.py
index a4e399b..a1fffcf 100755
--- a/src/crypto/hpke/translate_test_vectors.py
+++ b/src/crypto/hpke/translate_test_vectors.py
@@ -19,7 +19,7 @@
Usage: translate_test_vectors.py TEST_VECTORS_JSON_FILE
The TEST_VECTORS_JSON_FILE is expected to come from the JSON copy of
-draft-irtf-cfrg-hpke-12's test vectors, linked from its [TestVectors] citation.
+RFC 9180's test vectors, linked from its [TestVectors] citation.
The output is written to "hpke_test_vectors.txt".
"""
diff --git a/src/crypto/hrss/hrss.c b/src/crypto/hrss/hrss.c
index 8e21068..388c9a9 100644
--- a/src/crypto/hrss/hrss.c
+++ b/src/crypto/hrss/hrss.c
@@ -1314,8 +1314,7 @@
static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
const struct poly *a, const struct poly *b) {
#if defined(POLY_RQ_MUL_ASM)
- const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
- if (has_avx2) {
+ if (CRYPTO_is_AVX2_capable()) {
poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
return;
}
diff --git a/src/crypto/hrss/hrss_test.cc b/src/crypto/hrss/hrss_test.cc
index 0693c82..bab968c 100644
--- a/src/crypto/hrss/hrss_test.cc
+++ b/src/crypto/hrss/hrss_test.cc
@@ -453,8 +453,7 @@
#if defined(POLY_RQ_MUL_ASM) && defined(SUPPORTS_ABI_TEST)
TEST(HRSS, ABI) {
- const bool has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
- if (!has_avx2) {
+ if (!CRYPTO_is_AVX2_capable()) {
fprintf(stderr, "Skipping ABI test due to lack of AVX2 support.\n");
return;
}
diff --git a/src/crypto/impl_dispatch_test.cc b/src/crypto/impl_dispatch_test.cc
index dae9e96..631e78f 100644
--- a/src/crypto/impl_dispatch_test.cc
+++ b/src/crypto/impl_dispatch_test.cc
@@ -33,9 +33,9 @@
public:
void SetUp() override {
#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
- aesni_ = OPENSSL_ia32cap_P[1] & (1 << (57 - 32));
- avx_movbe_ = ((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41;
- ssse3_ = OPENSSL_ia32cap_P[1] & (1 << (41 - 32));
+ aesni_ = CRYPTO_is_AESNI_capable();
+ avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
+ ssse3_ = CRYPTO_is_SSSE3_capable();
is_x86_64_ =
#if defined(OPENSSL_X86_64)
true;
diff --git a/src/crypto/internal.h b/src/crypto/internal.h
index 42f94d5..78dbbbf 100644
--- a/src/crypto/internal.h
+++ b/src/crypto/internal.h
@@ -121,6 +121,10 @@
#include <valgrind/memcheck.h>
#endif
+#if defined(BORINGSSL_FIPS_BREAK_TESTS)
+#include <stdlib.h>
+#endif
+
#if !defined(__cplusplus)
#if defined(_MSC_VER)
#define alignas(x) __declspec(align(x))
@@ -932,19 +936,50 @@
// FIPS functions.
#if defined(BORINGSSL_FIPS)
+
// BORINGSSL_FIPS_abort is called when a FIPS power-on or continuous test
// fails. It prevents any further cryptographic operations by the current
// process.
void BORINGSSL_FIPS_abort(void) __attribute__((noreturn));
-#endif
-// boringssl_fips_self_test runs the FIPS KAT-based self tests. It returns one
-// on success and zero on error. The argument is the integrity hash of the FIPS
-// module and may be used to check and write flag files to suppress duplicate
-// self-tests. If |module_hash_len| is zero then no flag file will be checked
-// nor written and tests will always be run.
-int boringssl_fips_self_test(const uint8_t *module_hash,
- size_t module_hash_len);
+// boringssl_self_test_startup runs all startup self tests and returns one on
+// success or zero on error. Startup self tests do not include lazy tests.
+// Call |BORINGSSL_self_test| to run every self test.
+int boringssl_self_test_startup(void);
+
+// boringssl_ensure_rsa_self_test checks whether the RSA self-test has been run
+// in this address space. If not, it runs it and crashes the address space if
+// unsuccessful.
+void boringssl_ensure_rsa_self_test(void);
+
+// boringssl_ensure_ecc_self_test checks whether the ECDSA and ECDH self-test
+// has been run in this address space. If not, it runs it and crashes the
+// address space if unsuccessful.
+void boringssl_ensure_ecc_self_test(void);
+
+// boringssl_ensure_ffdh_self_test checks whether the FFDH self-test has been
+// run in this address space. If not, it runs it and crashes the address space
+// if unsuccessful.
+void boringssl_ensure_ffdh_self_test(void);
+
+#else
+
+// Outside of FIPS mode, the lazy tests are no-ops.
+
+OPENSSL_INLINE void boringssl_ensure_rsa_self_test(void) {}
+OPENSSL_INLINE void boringssl_ensure_ecc_self_test(void) {}
+OPENSSL_INLINE void boringssl_ensure_ffdh_self_test(void) {}
+
+#endif // FIPS
+
+// boringssl_self_test_sha256 performs a SHA-256 KAT.
+int boringssl_self_test_sha256(void);
+
+// boringssl_self_test_sha512 performs a SHA-512 KAT.
+int boringssl_self_test_sha512(void);
+
+// boringssl_self_test_hmac_sha256 performs an HMAC-SHA-256 KAT.
+int boringssl_self_test_hmac_sha256(void);
#if defined(BORINGSSL_FIPS_COUNTERS)
void boringssl_fips_inc_counter(enum fips_counter_t counter);
@@ -952,6 +987,17 @@
OPENSSL_INLINE void boringssl_fips_inc_counter(enum fips_counter_t counter) {}
#endif
+#if defined(BORINGSSL_FIPS_BREAK_TESTS)
+OPENSSL_INLINE int boringssl_fips_break_test(const char *test) {
+ const char *const value = getenv("BORINGSSL_FIPS_BREAK_TEST");
+ return value != NULL && strcmp(value, test) == 0;
+}
+#else
+OPENSSL_INLINE int boringssl_fips_break_test(const char *test) {
+ return 0;
+}
+#endif // BORINGSSL_FIPS_BREAK_TESTS
+
// Runtime CPU feature support
@@ -978,14 +1024,126 @@
extern uint32_t OPENSSL_ia32cap_P[4];
#if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY)
-const uint32_t *OPENSSL_ia32cap_get(void);
+// The FIPS module, as a static library, requires an out-of-line version of
+// |OPENSSL_ia32cap_get| so accesses can be rewritten by delocate. Mark the
+// function const so multiple accesses can be optimized together.
+const uint32_t *OPENSSL_ia32cap_get(void) __attribute__((const));
#else
OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) {
return OPENSSL_ia32cap_P;
}
#endif
+// See Intel manual, volume 2A, table 3-11.
+
+OPENSSL_INLINE int CRYPTO_is_FXSR_capable(void) {
+#if defined(__FXSR__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[0] & (1 << 24)) != 0;
#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_intel_cpu(void) {
+ // The reserved bit 30 is used to indicate an Intel CPU.
+ return (OPENSSL_ia32cap_get()[0] & (1 << 30)) != 0;
+}
+
+// See Intel manual, volume 2A, table 3-10.
+
+OPENSSL_INLINE int CRYPTO_is_PCLMUL_capable(void) {
+#if defined(__PCLMUL__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1 << 1)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSSE3_capable(void) {
+#if defined(__SSSE3__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1 << 9)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSE4_1_capable(void) {
+#if defined(__SSE4_1__)
+  return 1;
+#else
+  // Go through |OPENSSL_ia32cap_get| (not |OPENSSL_ia32cap_P|) so delocate
+  // can rewrite the access in FIPS static builds, like the other helpers.
+  return (OPENSSL_ia32cap_get()[1] & (1 << 19)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_MOVBE_capable(void) {
+#if defined(__MOVBE__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1 << 22)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) {
+#if defined(__AES__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) {
+#if defined(__AVX__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_RDRAND_capable(void) {
+ // The GCC/Clang feature name and preprocessor symbol for RDRAND are "rdrnd"
+ // and |__RDRND__|, respectively.
+#if defined(__RDRND__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+#endif
+}
+
+// See Intel manual, volume 2A, table 3-8.
+
+OPENSSL_INLINE int CRYPTO_is_BMI1_capable(void) {
+#if defined(__BMI1__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[2] & (1 << 3)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX2_capable(void) {
+#if defined(__AVX2__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[2] & (1 << 5)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_BMI2_capable(void) {
+#if defined(__BMI2__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[2] & (1 << 8)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) {
+#if defined(__ADX__)
+ return 1;
+#else
+ return (OPENSSL_ia32cap_get()[2] & (1 << 19)) != 0;
+#endif
+}
+
+#endif // OPENSSL_X86 || OPENSSL_X86_64
#if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
diff --git a/src/crypto/pem/pem_all.c b/src/crypto/pem/pem_all.c
index e419774..706b7f4 100644
--- a/src/crypto/pem/pem_all.c
+++ b/src/crypto/pem/pem_all.c
@@ -200,7 +200,7 @@
IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA,
DSAPrivateKey)
- IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
+IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
DSA *PEM_read_DSAPrivateKey(FILE *fp, DSA **dsa, pem_password_cb *cb, void *u)
{
EVP_PKEY *pktmp;
@@ -237,7 +237,7 @@
IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY,
ECPrivateKey)
- IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
+IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
EC_KEY *PEM_read_ECPrivateKey(FILE *fp, EC_KEY **eckey, pem_password_cb *cb,
void *u)
{
@@ -247,6 +247,6 @@
}
-IMPLEMENT_PEM_write_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
+IMPLEMENT_PEM_rw_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
- IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
+IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
diff --git a/src/crypto/pem/pem_pkey.c b/src/crypto/pem/pem_pkey.c
index 48d8c96..f75486d 100644
--- a/src/crypto/pem/pem_pkey.c
+++ b/src/crypto/pem/pem_pkey.c
@@ -176,39 +176,3 @@
BIO_free(b);
return ret;
}
-
-
-/* Transparently read in PKCS#3 or X9.42 DH parameters */
-
-DH *PEM_read_bio_DHparams(BIO *bp, DH **x, pem_password_cb *cb, void *u)
-{
- char *nm = NULL;
- const unsigned char *p = NULL;
- unsigned char *data = NULL;
- long len;
- DH *ret = NULL;
-
- if (!PEM_bytes_read_bio(&data, &len, &nm, PEM_STRING_DHPARAMS, bp, cb, u))
- return NULL;
- p = data;
-
- ret = d2i_DHparams(x, &p, len);
-
- if (ret == NULL)
- OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB);
- OPENSSL_free(nm);
- OPENSSL_free(data);
- return ret;
-}
-
-DH *PEM_read_DHparams(FILE *fp, DH **x, pem_password_cb *cb, void *u)
-{
- BIO *b = BIO_new_fp(fp, BIO_NOCLOSE);
- if (b == NULL) {
- OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB);
- return NULL;
- }
- DH *ret = PEM_read_bio_DHparams(b, x, cb, u);
- BIO_free(b);
- return ret;
-}
diff --git a/src/crypto/pkcs8/pkcs12_test.cc b/src/crypto/pkcs8/pkcs12_test.cc
index e67630d..958bd8d 100644
--- a/src/crypto/pkcs8/pkcs12_test.cc
+++ b/src/crypto/pkcs8/pkcs12_test.cc
@@ -34,7 +34,7 @@
static const char kPassword[] = "foo";
// kUnicodePassword is the password for unicode_password.p12
-static const char kUnicodePassword[] = u8"Hello, 世界";
+static const char kUnicodePassword[] = "Hello, 世界";
static bssl::Span<const uint8_t> StringToBytes(const std::string &str) {
return bssl::MakeConstSpan(reinterpret_cast<const uint8_t *>(str.data()),
@@ -391,7 +391,7 @@
{bssl::Span<const uint8_t>(kTestCert2)}, 0, 0, 0, 0);
// Test some Unicode.
- TestRoundTrip(kPassword, u8"Hello, 世界!",
+ TestRoundTrip(kPassword, "Hello, 世界!",
bssl::Span<const uint8_t>(kTestKey),
bssl::Span<const uint8_t>(kTestCert),
{bssl::Span<const uint8_t>(kTestCert2)}, 0, 0, 0, 0);
diff --git a/src/decrepit/des/cfb64ede.c b/src/decrepit/des/cfb64ede.c
index 6c39923..820c52e 100644
--- a/src/decrepit/des/cfb64ede.c
+++ b/src/decrepit/des/cfb64ede.c
@@ -58,7 +58,7 @@
#include <openssl/des.h>
-#include "../../crypto/fipsmodule/des/internal.h"
+#include "../../crypto/des/internal.h"
#include "../../crypto/internal.h"
diff --git a/src/include/openssl/hpke.h b/src/include/openssl/hpke.h
index 56251b7..e2c9855 100644
--- a/src/include/openssl/hpke.h
+++ b/src/include/openssl/hpke.h
@@ -30,7 +30,7 @@
// Hybrid Public Key Encryption (HPKE) enables a sender to encrypt messages to a
// receiver with a public key.
//
-// See https://tools.ietf.org/html/draft-irtf-cfrg-hpke-12.
+// See RFC 9180.
// Parameters.
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 232c627..a3b530e 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -4039,10 +4039,16 @@
// |len| bytes from |buf| contain the handshake message, one-byte
// ChangeCipherSpec body, and two-byte alert, respectively.
//
+// In connections that enable ECH, |cb| is additionally called with
+// |content_type| = |SSL3_RT_CLIENT_HELLO_INNER| for each ClientHelloInner that
+// is encrypted or decrypted. The |len| bytes from |buf| contain the
+// ClientHelloInner, including the reconstructed outer extensions and handshake
+// header.
+//
// For a V2ClientHello, |version| is |SSL2_VERSION|, |content_type| is zero, and
// the |len| bytes from |buf| contain the V2ClientHello structure.
OPENSSL_EXPORT void SSL_CTX_set_msg_callback(
- SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type,
+ SSL_CTX *ctx, void (*cb)(int is_write, int version, int content_type,
const void *buf, size_t len, SSL *ssl, void *arg));
// SSL_CTX_set_msg_callback_arg sets the |arg| parameter of the message
@@ -5598,7 +5604,7 @@
#define SSL_R_INVALID_ECH_PUBLIC_NAME 317
#define SSL_R_INVALID_ECH_CONFIG_LIST 318
#define SSL_R_ECH_REJECTED 319
-#define SSL_R_OUTER_EXTENSION_NOT_FOUND 320
+#define SSL_R_INVALID_OUTER_EXTENSION 320
#define SSL_R_INCONSISTENT_ECH_NEGOTIATION 321
#define SSL_R_SSLV3_ALERT_CLOSE_NOTIFY 1000
#define SSL_R_SSLV3_ALERT_UNEXPECTED_MESSAGE 1010
diff --git a/src/include/openssl/ssl3.h b/src/include/openssl/ssl3.h
index e3910f0..533142c 100644
--- a/src/include/openssl/ssl3.h
+++ b/src/include/openssl/ssl3.h
@@ -275,6 +275,7 @@
// Pseudo content type for SSL/TLS header info
#define SSL3_RT_HEADER 0x100
+#define SSL3_RT_CLIENT_HELLO_INNER 0x101
#define SSL3_AL_WARNING 1
#define SSL3_AL_FATAL 2
diff --git a/src/ssl/encrypted_client_hello.cc b/src/ssl/encrypted_client_hello.cc
index 64fee3d..9e9adfe 100644
--- a/src/ssl/encrypted_client_hello.cc
+++ b/src/ssl/encrypted_client_hello.cc
@@ -203,6 +203,12 @@
OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
return false;
}
+ // The ECH extension itself is not in the AAD and may not be referenced.
+ if (want == TLSEXT_TYPE_encrypted_client_hello) {
+ *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+ OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_OUTER_EXTENSION);
+ return false;
+ }
// Seek to |want| in |outer_extensions|. |ext_list| is required to match
// ClientHelloOuter in order.
uint16_t found;
@@ -210,7 +216,7 @@
do {
if (CBS_len(&outer_extensions) == 0) {
*out_alert = SSL_AD_ILLEGAL_PARAMETER;
- OPENSSL_PUT_ERROR(SSL, SSL_R_OUTER_EXTENSION_NOT_FOUND);
+ OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_OUTER_EXTENSION);
return false;
}
if (!CBS_get_u16(&outer_extensions, &found) ||
@@ -252,8 +258,8 @@
return true;
}
-bool ssl_client_hello_decrypt(EVP_HPKE_CTX *hpke_ctx, Array<uint8_t> *out,
- bool *out_is_decrypt_error,
+bool ssl_client_hello_decrypt(SSL_HANDSHAKE *hs, uint8_t *out_alert,
+ bool *out_is_decrypt_error, Array<uint8_t> *out,
const SSL_CLIENT_HELLO *client_hello_outer,
Span<const uint8_t> payload) {
*out_is_decrypt_error = false;
@@ -264,6 +270,7 @@
Array<uint8_t> aad;
if (!aad.CopyFrom(MakeConstSpan(client_hello_outer->client_hello,
client_hello_outer->client_hello_len))) {
+ *out_alert = SSL_AD_INTERNAL_ERROR;
return false;
}
@@ -278,35 +285,47 @@
payload.data() - client_hello_outer->client_hello, payload.size());
OPENSSL_memset(payload_aad.data(), 0, payload_aad.size());
+ // Decrypt the EncodedClientHelloInner.
+ Array<uint8_t> encoded;
#if defined(BORINGSSL_UNSAFE_FUZZER_MODE)
// In fuzzer mode, disable encryption to improve coverage. We reserve a short
// input to signal decryption failure, so the fuzzer can explore fallback to
// ClientHelloOuter.
const uint8_t kBadPayload[] = {0xff};
if (payload == kBadPayload) {
+ *out_alert = SSL_AD_DECRYPT_ERROR;
*out_is_decrypt_error = true;
OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
return false;
}
- if (!out->CopyFrom(payload)) {
+ if (!encoded.CopyFrom(payload)) {
+ *out_alert = SSL_AD_INTERNAL_ERROR;
return false;
}
#else
- // Attempt to decrypt into |out|.
- if (!out->Init(payload.size())) {
- OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+ if (!encoded.Init(payload.size())) {
+ *out_alert = SSL_AD_INTERNAL_ERROR;
return false;
}
size_t len;
- if (!EVP_HPKE_CTX_open(hpke_ctx, out->data(), &len, out->size(),
- payload.data(), payload.size(), aad.data(),
- aad.size())) {
+ if (!EVP_HPKE_CTX_open(hs->ech_hpke_ctx.get(), encoded.data(), &len,
+ encoded.size(), payload.data(), payload.size(),
+ aad.data(), aad.size())) {
+ *out_alert = SSL_AD_DECRYPT_ERROR;
*out_is_decrypt_error = true;
OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
return false;
}
- out->Shrink(len);
+ encoded.Shrink(len);
#endif
+
+ if (!ssl_decode_client_hello_inner(hs->ssl, out_alert, out, encoded,
+ client_hello_outer)) {
+ return false;
+ }
+
+ ssl_do_msg_callback(hs->ssl, /*is_write=*/0, SSL3_RT_CLIENT_HELLO_INNER,
+ *out);
return true;
}
@@ -789,6 +808,8 @@
binder_len);
}
+ ssl_do_msg_callback(ssl, /*is_write=*/1, SSL3_RT_CLIENT_HELLO_INNER,
+ hello_inner);
if (!hs->inner_transcript.Update(hello_inner)) {
return false;
}
diff --git a/src/ssl/handshake_client.cc b/src/ssl/handshake_client.cc
index 17b41e0..e630121 100644
--- a/src/ssl/handshake_client.cc
+++ b/src/ssl/handshake_client.cc
@@ -331,7 +331,7 @@
Array<uint8_t> msg;
if (!ssl->method->init_message(ssl, cbb.get(), &body, SSL3_MT_CLIENT_HELLO) ||
!ssl_write_client_hello_without_extensions(hs, &body, type,
- /*empty_session_id*/ false) ||
+ /*empty_session_id=*/false) ||
!ssl_add_clienthello_tlsext(hs, &body, /*out_encoded=*/nullptr,
&needs_psk_binder, type, CBB_len(&body)) ||
!ssl->method->finish_message(ssl, cbb.get(), &msg)) {
diff --git a/src/ssl/handshake_server.cc b/src/ssl/handshake_server.cc
index 1d03c55..15820be 100644
--- a/src/ssl/handshake_server.cc
+++ b/src/ssl/handshake_server.cc
@@ -554,29 +554,22 @@
ERR_clear_error();
continue;
}
- Array<uint8_t> encoded_client_hello_inner;
bool is_decrypt_error;
- if (!ssl_client_hello_decrypt(hs->ech_hpke_ctx.get(),
- &encoded_client_hello_inner,
- &is_decrypt_error, client_hello, payload)) {
+ if (!ssl_client_hello_decrypt(hs, out_alert, &is_decrypt_error,
+ &hs->ech_client_hello_buf, client_hello,
+ payload)) {
if (is_decrypt_error) {
// Ignore the error and try another ECHConfig.
ERR_clear_error();
+ // The |out_alert| calling convention currently relies on a default of
+ // |SSL_AD_DECODE_ERROR|. https://crbug.com/boringssl/373 tracks
+ // switching to sum types, which avoids this.
+ *out_alert = SSL_AD_DECODE_ERROR;
continue;
}
OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
return false;
}
-
- // Recover the ClientHelloInner from the EncodedClientHelloInner.
- bssl::Array<uint8_t> client_hello_inner;
- if (!ssl_decode_client_hello_inner(ssl, out_alert, &client_hello_inner,
- encoded_client_hello_inner,
- client_hello)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
- return false;
- }
- hs->ech_client_hello_buf = std::move(client_hello_inner);
hs->ech_config_id = config_id;
ssl->s3->ech_status = ssl_ech_accepted;
return true;
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index 5196f17..8f68fc5 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -1498,17 +1498,19 @@
// ClientHelloOuter |client_hello_outer|. If successful, it writes the recovered
// ClientHelloInner to |out_client_hello_inner|. It returns true on success and
// false on failure.
+//
+// This function is exported for fuzzing.
OPENSSL_EXPORT bool ssl_decode_client_hello_inner(
SSL *ssl, uint8_t *out_alert, Array<uint8_t> *out_client_hello_inner,
Span<const uint8_t> encoded_client_hello_inner,
const SSL_CLIENT_HELLO *client_hello_outer);
-// ssl_client_hello_decrypt attempts to decrypt the |payload| and writes the
-// result to |*out|. |payload| must point into |client_hello_outer|. It returns
-// true on success and false on error. On error, it sets |*out_is_decrypt_error|
-// to whether the failure was due to a bad ciphertext.
-bool ssl_client_hello_decrypt(EVP_HPKE_CTX *hpke_ctx, Array<uint8_t> *out,
- bool *out_is_decrypt_error,
+// ssl_client_hello_decrypt attempts to decrypt and decode the |payload|. It
+// writes the result to |*out|. |payload| must point into |client_hello_outer|.
+// It returns true on success and false on error. On error, it sets
+// |*out_is_decrypt_error| to whether the failure was due to a bad ciphertext.
+bool ssl_client_hello_decrypt(SSL_HANDSHAKE *hs, uint8_t *out_alert,
+ bool *out_is_decrypt_error, Array<uint8_t> *out,
const SSL_CLIENT_HELLO *client_hello_outer,
Span<const uint8_t> payload);
@@ -3511,7 +3513,7 @@
bssl::UniquePtr<bssl::CERT> cert;
// callback that allows applications to peek at protocol messages
- void (*msg_callback)(int write_p, int version, int content_type,
+ void (*msg_callback)(int is_write, int version, int content_type,
const void *buf, size_t len, SSL *ssl,
void *arg) = nullptr;
void *msg_callback_arg = nullptr;
diff --git a/src/ssl/test/CMakeLists.txt b/src/ssl/test/CMakeLists.txt
index bb9bd81..f02d6e2 100644
--- a/src/ssl/test/CMakeLists.txt
+++ b/src/ssl/test/CMakeLists.txt
@@ -17,7 +17,7 @@
target_link_libraries(bssl_shim test_support_lib ssl crypto)
-if(UNIX AND NOT APPLE AND NOT ANDROID)
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
add_executable(
handshaker
diff --git a/src/ssl/test/runner/hpke/hpke.go b/src/ssl/test/runner/hpke/hpke.go
index e6fc7be..b65dcf3 100644
--- a/src/ssl/test/runner/hpke/hpke.go
+++ b/src/ssl/test/runner/hpke/hpke.go
@@ -14,7 +14,7 @@
// Package hpke implements Hybrid Public Key Encryption (HPKE).
//
-// See https://tools.ietf.org/html/draft-irtf-cfrg-hpke-12.
+// See RFC 9180.
package hpke
import (
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index cfff714..4c1c955 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -16776,9 +16776,7 @@
},
shouldFail: true,
expectedLocalError: "remote error: illegal parameter",
- // The decoding algorithm relies on the ordering requirement, so
- // the wrong order appears as a missing extension.
- expectedError: ":OUTER_EXTENSION_NOT_FOUND:",
+ expectedError: ":INVALID_OUTER_EXTENSION:",
})
// Test that the server rejects duplicated values in ech_outer_extensions.
@@ -16812,9 +16810,7 @@
},
shouldFail: true,
expectedLocalError: "remote error: illegal parameter",
- // The decoding algorithm relies on the ordering requirement, so
- // duplicates appear as missing extensions.
- expectedError: ":OUTER_EXTENSION_NOT_FOUND:",
+ expectedError: ":INVALID_OUTER_EXTENSION:",
})
// Test that the server rejects references to missing extensions in
@@ -16843,7 +16839,7 @@
},
shouldFail: true,
expectedLocalError: "remote error: illegal parameter",
- expectedError: ":DECODE_ERROR:",
+ expectedError: ":INVALID_OUTER_EXTENSION:",
})
// Test that the server rejects a references to the ECH extension in
@@ -16871,7 +16867,46 @@
},
shouldFail: true,
expectedLocalError: "remote error: illegal parameter",
- expectedError: ":DECODE_ERROR:",
+ expectedError: ":INVALID_OUTER_EXTENSION:",
+ })
+
+ // Test the message callback is correctly reported with ECH.
+ clientAndServerHello := "read hs 1\nread clienthelloinner\nwrite hs 2\n"
+ expectMsgCallback := clientAndServerHello + "write ccs\n"
+ if hrr {
+ expectMsgCallback += clientAndServerHello
+ }
+ // EncryptedExtensions onwards.
+ expectMsgCallback += `write hs 8
+write hs 11
+write hs 15
+write hs 20
+read hs 20
+write hs 4
+write hs 4
+`
+ testCases = append(testCases, testCase{
+ testType: serverTest,
+ protocol: protocol,
+ name: prefix + "ECH-Server-MessageCallback" + suffix,
+ config: Config{
+ ServerName: "secret.example",
+ ClientECHConfig: echConfig.ECHConfig,
+ DefaultCurves: defaultCurves,
+ Bugs: ProtocolBugs{
+ NoCloseNotify: true, // Align QUIC and TCP traces.
+ },
+ },
+ flags: []string{
+ "-ech-server-config", base64FlagValue(echConfig.ECHConfig.Raw),
+ "-ech-server-key", base64FlagValue(echConfig.Key),
+ "-ech-is-retry-config", "1",
+ "-expect-ech-accept",
+ "-expect-msg-callback", expectMsgCallback,
+ },
+ expectations: connectionExpectations{
+ echAccepted: true,
+ },
})
}
@@ -18622,6 +18657,60 @@
shouldFail: true,
expectedError: ":INCONSISTENT_ECH_NEGOTIATION:",
})
+
+ // Test the message callback is correctly reported, with and without
+ // HelloRetryRequest.
+ clientAndServerHello := "write clienthelloinner\nwrite hs 1\nread hs 2\n"
+ // EncryptedExtensions onwards.
+ finishHandshake := `read hs 8
+read hs 11
+read hs 15
+read hs 20
+write hs 20
+read hs 4
+read hs 4
+`
+ testCases = append(testCases, testCase{
+ testType: clientTest,
+ protocol: protocol,
+ name: prefix + "ECH-Client-MessageCallback",
+ config: Config{
+ MinVersion: VersionTLS13,
+ MaxVersion: VersionTLS13,
+ ServerECHConfigs: []ServerECHConfig{echConfig},
+ Bugs: ProtocolBugs{
+ NoCloseNotify: true, // Align QUIC and TCP traces.
+ },
+ },
+ flags: []string{
+ "-ech-config-list", base64FlagValue(CreateECHConfigList(echConfig.ECHConfig.Raw)),
+ "-expect-ech-accept",
+ "-expect-msg-callback", clientAndServerHello + "write ccs\n" + finishHandshake,
+ },
+ expectations: connectionExpectations{echAccepted: true},
+ })
+ testCases = append(testCases, testCase{
+ testType: clientTest,
+ protocol: protocol,
+ name: prefix + "ECH-Client-MessageCallback-HelloRetryRequest",
+ config: Config{
+ MinVersion: VersionTLS13,
+ MaxVersion: VersionTLS13,
+ CurvePreferences: []CurveID{CurveP384},
+ ServerECHConfigs: []ServerECHConfig{echConfig},
+ Bugs: ProtocolBugs{
+ ExpectMissingKeyShare: true, // Check we triggered HRR.
+ NoCloseNotify: true, // Align QUIC and TCP traces.
+ },
+ },
+ flags: []string{
+ "-ech-config-list", base64FlagValue(CreateECHConfigList(echConfig.ECHConfig.Raw)),
+ "-expect-ech-accept",
+ "-expect-hrr", // Check we triggered HRR.
+ "-expect-msg-callback", clientAndServerHello + "write ccs\n" + clientAndServerHello + finishHandshake,
+ },
+ expectations: connectionExpectations{echAccepted: true},
+ })
}
}
@@ -19220,8 +19309,22 @@
noneOfPattern = strings.Split(*skipTest, ";")
}
+ shardIndex, shardTotal, err := getSharding()
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(1)
+ }
+
+ if shardTotal > 0 {
+ fmt.Printf("This is shard %d of 0..%d (inclusive)\n", shardIndex, shardTotal-1)
+ }
+
var foundTest bool
for i := range testCases {
+ if shardTotal > 0 && i%shardTotal != shardIndex {
+ continue
+ }
+
matched, err := match(oneOfPatternIfAny, noneOfPattern, testCases[i].name)
if err != nil {
fmt.Fprintf(os.Stderr, "Error matching pattern: %s\n", err)
@@ -19259,7 +19362,7 @@
}
}
- if !foundTest {
+ if !foundTest && shardTotal == 0 {
fmt.Fprintf(os.Stderr, "No tests run\n")
os.Exit(1)
}
diff --git a/src/ssl/test/runner/sharding.go b/src/ssl/test/runner/sharding.go
new file mode 100644
index 0000000..5061a6f
--- /dev/null
+++ b/src/ssl/test/runner/sharding.go
@@ -0,0 +1,77 @@
+// Copyright (c) 2022, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package runner
+
+import (
+ "fmt"
+ "io/ioutil"
+ "os"
+ "strconv"
+)
+
+const (
+ shardStatusFileEnv = "TEST_SHARD_STATUS_FILE"
+ shardTotalEnv = "TEST_TOTAL_SHARDS"
+ shardIndexEnv = "TEST_SHARD_INDEX"
+ shardPrefix = "RUNNER_"
+)
+
+func init() {
+ // When run under `go test`, init() functions may be run twice if the
+ // test binary ends up forking and execing itself. Therefore we move
+ // the environment variables to names that don't interfere with Go's
+ // own support for sharding. If we recorded and erased them, then they
+ // wouldn't exist the second time the binary runs.
+ for _, key := range []string{shardStatusFileEnv, shardTotalEnv, shardIndexEnv} {
+ value := os.Getenv(key)
+ if len(value) > 0 {
+ os.Setenv(shardPrefix+key, value)
+ os.Setenv(key, "")
+ }
+ }
+}
+
+// getSharding returns the shard index and count, or zeros if sharding is not
+// enabled.
+func getSharding() (index, total int, err error) {
+	statusFile := os.Getenv(shardPrefix + shardStatusFileEnv)
+	totalNumStr := os.Getenv(shardPrefix + shardTotalEnv)
+	indexStr := os.Getenv(shardPrefix + shardIndexEnv)
+	if len(totalNumStr) == 0 || len(indexStr) == 0 {
+		return 0, 0, nil
+	}
+
+	totalNum, err := strconv.Atoi(totalNumStr)
+	if err != nil {
+		// Error strings carry no trailing newline; the caller prints them
+		// with Fprintln, which supplies one.
+		return 0, 0, fmt.Errorf("$%s is %q, but expected a number", shardTotalEnv, totalNumStr)
+	}
+
+	index, err = strconv.Atoi(indexStr)
+	if err != nil {
+		return 0, 0, fmt.Errorf("$%s is %q, but expected a number", shardIndexEnv, indexStr)
+	}
+
+	if index < 0 || index >= totalNum {
+		return 0, 0, fmt.Errorf("shard index/total of %d/%d is invalid", index, totalNum)
+	}
+
+	if len(statusFile) > 0 {
+		if err := ioutil.WriteFile(statusFile, nil, 0664); err != nil {
+			return 0, 0, err
+		}
+	}
+
+	return index, totalNum, nil
+}
diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc
index 9a0f63d..a6409d6 100644
--- a/src/ssl/test/test_config.cc
+++ b/src/ssl/test/test_config.cc
@@ -602,6 +602,7 @@
state->msg_callback_text += "v2clienthello\n";
return;
+ case SSL3_RT_CLIENT_HELLO_INNER:
case SSL3_RT_HANDSHAKE: {
CBS cbs;
CBS_init(&cbs, buf_u8, len);
@@ -619,10 +620,19 @@
return;
}
char text[16];
- snprintf(text, sizeof(text), "hs %d\n", type);
- state->msg_callback_text += text;
- if (!is_write) {
- state->last_message_received = type;
+ if (content_type == SSL3_RT_CLIENT_HELLO_INNER) {
+ if (type != SSL3_MT_CLIENT_HELLO) {
+ fprintf(stderr, "Invalid header for ClientHelloInner.\n");
+ state->msg_callback_ok = false;
+ return;
+ }
+ state->msg_callback_text += "clienthelloinner\n";
+ } else {
+ snprintf(text, sizeof(text), "hs %d\n", type);
+ state->msg_callback_text += text;
+ if (!is_write) {
+ state->last_message_received = type;
+ }
}
return;
}
diff --git a/src/ssl/tls13_server.cc b/src/ssl/tls13_server.cc
index 2f000e5..dbf239d 100644
--- a/src/ssl/tls13_server.cc
+++ b/src/ssl/tls13_server.cc
@@ -658,28 +658,16 @@
}
// Decrypt the payload with the HPKE context from the first ClientHello.
- Array<uint8_t> encoded_client_hello_inner;
+ uint8_t alert = SSL_AD_DECODE_ERROR;
bool unused;
- if (!ssl_client_hello_decrypt(hs->ech_hpke_ctx.get(),
- &encoded_client_hello_inner, &unused,
- &client_hello, payload)) {
+ if (!ssl_client_hello_decrypt(hs, &alert, &unused,
+ &hs->ech_client_hello_buf, &client_hello,
+ payload)) {
// Decryption failure is fatal in the second ClientHello.
OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
- ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECRYPT_ERROR);
- return ssl_hs_error;
- }
-
- // Recover the ClientHelloInner from the EncodedClientHelloInner.
- uint8_t alert = SSL_AD_DECODE_ERROR;
- bssl::Array<uint8_t> client_hello_inner;
- if (!ssl_decode_client_hello_inner(ssl, &alert, &client_hello_inner,
- encoded_client_hello_inner,
- &client_hello)) {
- OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
return ssl_hs_error;
}
- hs->ech_client_hello_buf = std::move(client_hello_inner);
// Reparse |client_hello| from the buffer owned by |hs|.
if (!hs->GetClientHello(&msg, &client_hello)) {
diff --git a/src/tool/server.cc b/src/tool/server.cc
index 18b692d..ebecee0 100644
--- a/src/tool/server.cc
+++ b/src/tool/server.cc
@@ -132,21 +132,37 @@
static bssl::UniquePtr<X509> MakeSelfSignedCert(EVP_PKEY *evp_pkey,
const int valid_days) {
+ uint64_t serial;
bssl::UniquePtr<X509> x509(X509_new());
- uint32_t serial;
- RAND_bytes(reinterpret_cast<uint8_t*>(&serial), sizeof(serial));
- ASN1_INTEGER_set(X509_get_serialNumber(x509.get()), serial >> 1);
- X509_gmtime_adj(X509_get_notBefore(x509.get()), 0);
- X509_gmtime_adj(X509_get_notAfter(x509.get()), 60 * 60 * 24 * valid_days);
+ if (!x509 || //
+ !X509_set_version(x509.get(), X509_VERSION_3) ||
+ !RAND_bytes(reinterpret_cast<uint8_t *>(&serial), sizeof(serial)) ||
+ !ASN1_INTEGER_set_uint64(X509_get_serialNumber(x509.get()), serial) ||
+ !X509_gmtime_adj(X509_get_notBefore(x509.get()), 0) ||
+ !X509_gmtime_adj(X509_get_notAfter(x509.get()),
+ 60 * 60 * 24 * valid_days)) {
+ return nullptr;
+ }
- X509_NAME* subject = X509_get_subject_name(x509.get());
- X509_NAME_add_entry_by_txt(subject, "C", MBSTRING_ASC,
- reinterpret_cast<const uint8_t *>("US"), -1, -1,
- 0);
- X509_NAME_add_entry_by_txt(subject, "O", MBSTRING_ASC,
- reinterpret_cast<const uint8_t *>("BoringSSL"), -1,
- -1, 0);
- X509_set_issuer_name(x509.get(), subject);
+ X509_NAME *subject = X509_get_subject_name(x509.get());
+ if (!X509_NAME_add_entry_by_txt(subject, "C", MBSTRING_ASC,
+ reinterpret_cast<const uint8_t *>("US"), -1,
+ -1, 0) ||
+ !X509_NAME_add_entry_by_txt(
+ subject, "O", MBSTRING_ASC,
+ reinterpret_cast<const uint8_t *>("BoringSSL"), -1, -1, 0) ||
+ !X509_set_issuer_name(x509.get(), subject)) {
+ return nullptr;
+ }
+
+ // macOS requires an explicit EKU extension.
+ bssl::UniquePtr<STACK_OF(ASN1_OBJECT)> ekus(sk_ASN1_OBJECT_new_null());
+ if (!ekus ||
+ !sk_ASN1_OBJECT_push(ekus.get(), OBJ_nid2obj(NID_server_auth)) ||
+ !X509_add1_ext_i2d(x509.get(), NID_ext_key_usage, ekus.get(), /*crit=*/1,
+ /*flags=*/0)) {
+ return nullptr;
+ }
if (!X509_set_pubkey(x509.get(), evp_pkey)) {
fprintf(stderr, "Failed to set public key.\n");
diff --git a/src/util/BUILD.toplevel b/src/util/BUILD.toplevel
index 462a24f..cfa695a 100644
--- a/src/util/BUILD.toplevel
+++ b/src/util/BUILD.toplevel
@@ -18,10 +18,11 @@
"crypto_headers",
"crypto_internal_headers",
"crypto_sources",
+ "crypto_sources_apple_aarch64",
+ "crypto_sources_apple_x86_64",
"crypto_sources_linux_aarch64",
"crypto_sources_linux_ppc64le",
"crypto_sources_linux_x86_64",
- "crypto_sources_mac_x86_64",
"fips_fragments",
"ssl_headers",
"ssl_internal_headers",
@@ -36,52 +37,42 @@
config_setting(
name = "linux_aarch64",
- values = {"cpu": "aarch64"},
+ constraint_values = [
+ "@platforms//os:linux",
+ "@platforms//cpu:aarch64",
+ ],
)
config_setting(
name = "linux_x86_64",
- values = {"cpu": "k8"},
+ constraint_values = [
+ "@platforms//os:linux",
+ "@platforms//cpu:x86_64",
+ ],
)
config_setting(
name = "linux_ppc64le",
- values = {"cpu": "ppc"},
+ constraint_values = [
+ "@platforms//os:linux",
+ "@platforms//cpu:ppc",
+ ],
)
config_setting(
- name = "mac_x86_64",
- values = {"cpu": "darwin"},
+ name = "macos_aarch64",
+ constraint_values = [
+ "@platforms//os:macos",
+ "@platforms//cpu:aarch64",
+ ],
)
config_setting(
- name = "windows_x86_64",
- values = {"cpu": "x64_windows"},
-)
-
-config_setting(
- name = "android_legacy",
- values = {"crosstool_top": "//external:android/crosstool"},
-)
-
-config_setting(
- name = "android_stlport",
- values = {"crosstool_top": "@androidndk//:toolchain-stlport"},
-)
-
-config_setting(
- name = "android_libcpp",
- values = {"crosstool_top": "@androidndk//:toolchain-libcpp"},
-)
-
-config_setting(
- name = "android_gnu_libstdcpp",
- values = {"crosstool_top": "@androidndk//:toolchain-gnu-libstdcpp"},
-)
-
-config_setting(
- name = "android_default",
- values = {"crosstool_top": "@androidndk//:default_crosstool"},
+ name = "macos_x86_64",
+ constraint_values = [
+ "@platforms//os:macos",
+ "@platforms//cpu:x86_64",
+ ],
)
posix_copts = [
@@ -98,11 +89,6 @@
"-Wwrite-strings",
"-Wshadow",
"-fno-common",
-
- # Modern build environments should be able to set this to use atomic
- # operations for reference counting rather than locks. However, it's
- # known not to work on some Android builds.
- # "-DOPENSSL_C11_ATOMIC",
]
linux_copts = posix_copts + [
@@ -113,24 +99,29 @@
]
boringssl_copts = select({
- ":linux_aarch64": linux_copts,
- ":linux_ppc64le": linux_copts,
- ":linux_x86_64": linux_copts,
- ":mac_x86_64": posix_copts,
- ":windows_x86_64": [
- "-DWIN32_LEAN_AND_MEAN",
- "-DOPENSSL_NO_ASM",
- ],
- "//conditions:default": ["-DOPENSSL_NO_ASM"],
+ "@platforms//os:linux": linux_copts,
+ "@platforms//os:macos": posix_copts,
+ "@platforms//os:windows": ["-DWIN32_LEAN_AND_MEAN"],
+ "//conditions:default": [],
})
+# These selects must be kept in sync.
crypto_sources_asm = select({
":linux_aarch64": crypto_sources_linux_aarch64,
":linux_ppc64le": crypto_sources_linux_ppc64le,
":linux_x86_64": crypto_sources_linux_x86_64,
- ":mac_x86_64": crypto_sources_mac_x86_64,
+ ":macos_aarch64": crypto_sources_apple_aarch64,
+ ":macos_x86_64": crypto_sources_apple_x86_64,
"//conditions:default": [],
})
+boringssl_copts += select({
+ ":linux_aarch64": [],
+ ":linux_ppc64le": [],
+ ":linux_x86_64": [],
+ ":macos_aarch64": [],
+ ":macos_x86_64": [],
+ "//conditions:default": ["-DOPENSSL_NO_ASM"],
+})
# For C targets only (not C++), compile with C11 support.
posix_copts_c11 = [
@@ -141,10 +132,8 @@
]
boringssl_copts_c11 = boringssl_copts + select({
- ":linux_aarch64": posix_copts_c11,
- ":linux_ppc64le": posix_copts_c11,
- ":linux_x86_64": posix_copts_c11,
- ":mac_x86_64": posix_copts_c11,
+ "@platforms//os:linux": posix_copts_c11,
+ "@platforms//os:macos": posix_copts_c11,
"//conditions:default": [],
})
@@ -155,10 +144,8 @@
]
boringssl_copts_cxx = boringssl_copts + select({
- ":linux_aarch64": posix_copts_cxx,
- ":linux_ppc64le": posix_copts_cxx,
- ":linux_x86_64": posix_copts_cxx,
- ":mac_x86_64": posix_copts_cxx,
+ "@platforms//os:linux": posix_copts_cxx,
+ "@platforms//os:macos": posix_copts_cxx,
"//conditions:default": [],
})
@@ -171,13 +158,9 @@
linkopts = select({
# Android supports pthreads, but does not provide a libpthread
# to link against.
- ":android_legacy": [],
- ":android_stlport": [],
- ":android_libcpp": [],
- ":android_gnu_libstdcpp": [],
- ":android_default": [],
- ":mac_x86_64": [],
- ":windows_x86_64": ["-defaultlib:advapi32.lib"],
+ "@platforms//os:android": [],
+ "@platforms//os:macos": [],
+ "@platforms//os:windows": ["-defaultlib:advapi32.lib"],
"//conditions:default": ["-lpthread"],
}),
visibility = ["//visibility:public"],
diff --git a/src/util/bot/DEPS b/src/util/bot/DEPS
index e3c95f3..574d94b 100644
--- a/src/util/bot/DEPS
+++ b/src/util/bot/DEPS
@@ -187,7 +187,7 @@
'action': [ 'download_from_google_storage',
'--no_resume',
'--bucket', 'chrome-boringssl-sde',
- '-s', 'boringssl/util/bot/sde-linux64.tar.bz2.sha1'
+ '-s', 'boringssl/util/bot/sde-linux64.tar.xz.sha1'
],
},
{
@@ -196,7 +196,7 @@
'condition': 'checkout_sde and host_os == "linux"',
'action': [ 'python3',
'boringssl/util/bot/extract.py',
- 'boringssl/util/bot/sde-linux64.tar.bz2',
+ 'boringssl/util/bot/sde-linux64.tar.xz',
'boringssl/util/bot/sde-linux64/',
],
},
@@ -207,7 +207,7 @@
'action': [ 'download_from_google_storage',
'--no_resume',
'--bucket', 'chrome-boringssl-sde',
- '-s', 'boringssl/util/bot/sde-win32.tar.bz2.sha1'
+ '-s', 'boringssl/util/bot/sde-win32.tar.xz.sha1'
],
},
{
@@ -216,7 +216,7 @@
'condition': 'checkout_sde and host_os == "win"',
'action': [ 'python3',
'boringssl/util/bot/extract.py',
- 'boringssl/util/bot/sde-win32.tar.bz2',
+ 'boringssl/util/bot/sde-win32.tar.xz',
'boringssl/util/bot/sde-win32/',
],
},
diff --git a/src/util/bot/UPDATING b/src/util/bot/UPDATING
index 2e6b914..dad6192 100644
--- a/src/util/bot/UPDATING
+++ b/src/util/bot/UPDATING
@@ -46,13 +46,13 @@
The current revision is strawberry-perl-5.26.2.1-64bit-portable.zip.
-Finally, update sde-linux64.tar.bz2 and sde-win32.tar.bz2 by downloading the
+Finally, update sde-linux64.tar.xz and sde-win32.tar.xz by downloading the
latet release from Intel at
https://software.intel.com/en-us/articles/intel-software-development-emulator,
but upload it with the following command. (Note the bucket is different.)
- upload_to_google_storage.py -b chrome-boringssl-sde sde-linux64.tar.bz2 sde-win32.tar.bz2
+ upload_to_google_storage.py -b chrome-boringssl-sde sde-linux64.tar.xz sde-win32.tar.xz
-The current revision is sde-external-8.50.0-2020-03-26-*.tar.bz2.
+The current revision is sde-external-9.0.0-2021-11-07-*.tar.xz.
When adding new files, remember to update .gitignore.
diff --git a/src/util/bot/extract.py b/src/util/bot/extract.py
index 9b1b88a..4ef5f65 100644
--- a/src/util/bot/extract.py
+++ b/src/util/bot/extract.py
@@ -118,6 +118,8 @@
entries = IterateTar(archive, 'gz')
elif archive.endswith('.tar.bz2'):
entries = IterateTar(archive, 'bz2')
+ elif archive.endswith('.tar.xz'):
+ entries = IterateTar(archive, 'xz')
else:
raise ValueError(archive)
diff --git a/src/util/bot/sde-linux64.tar.bz2.sha1 b/src/util/bot/sde-linux64.tar.bz2.sha1
deleted file mode 100644
index c450f63..0000000
--- a/src/util/bot/sde-linux64.tar.bz2.sha1
+++ /dev/null
@@ -1 +0,0 @@
-baacb5a29755e299d3384c41c6dd55f65235ef1f
\ No newline at end of file
diff --git a/src/util/bot/sde-linux64.tar.xz.sha1 b/src/util/bot/sde-linux64.tar.xz.sha1
new file mode 100644
index 0000000..f9ee198
--- /dev/null
+++ b/src/util/bot/sde-linux64.tar.xz.sha1
@@ -0,0 +1 @@
+8bba6e01a47b2cfd9e7429f77256db540031ff43
\ No newline at end of file
diff --git a/src/util/bot/sde-win32.tar.bz2.sha1 b/src/util/bot/sde-win32.tar.bz2.sha1
deleted file mode 100644
index b960747..0000000
--- a/src/util/bot/sde-win32.tar.bz2.sha1
+++ /dev/null
@@ -1 +0,0 @@
-cc2d77ff4a221165a8bb13f43ccfbff6550b90c8
\ No newline at end of file
diff --git a/src/util/bot/sde-win32.tar.xz.sha1 b/src/util/bot/sde-win32.tar.xz.sha1
new file mode 100644
index 0000000..dbaf87f
--- /dev/null
+++ b/src/util/bot/sde-win32.tar.xz.sha1
@@ -0,0 +1 @@
+59ef225031e14e5ac257ada61d416f6ea0c9c080
\ No newline at end of file
diff --git a/src/util/doc.go b/src/util/doc.go
index a38e078..651998e 100644
--- a/src/util/doc.go
+++ b/src/util/doc.go
@@ -503,7 +503,7 @@
// markupPipeWords converts |s| into an HTML string, safe to be included outside
// a tag, while also marking up words surrounded by |.
-func markupPipeWords(allDecls map[string]string, s string) template.HTML {
+func markupPipeWords(allDecls map[string]string, s string, linkDecls bool) template.HTML {
// It is safe to look for '|' in the HTML-escaped version of |s|
// below. The escaped version cannot include '|' instead tags because
// there are no tags by construction.
@@ -524,12 +524,10 @@
if i > 0 && (j == -1 || j > i) {
ret += "<tt>"
anchor, isLink := allDecls[s[:i]]
- if isLink {
- ret += fmt.Sprintf("<a href=\"%s\">", template.HTMLEscapeString(anchor))
- }
- ret += s[:i]
- if isLink {
- ret += "</a>"
+ if linkDecls && isLink {
+ ret += fmt.Sprintf("<a href=\"%s\">%s</a>", template.HTMLEscapeString(anchor), s[:i])
+ } else {
+ ret += s[:i]
}
ret += "</tt>"
s = s[i+1:]
@@ -602,11 +600,12 @@
headerTmpl := template.New("headerTmpl")
headerTmpl.Funcs(template.FuncMap{
- "firstSentence": firstSentence,
- "markupPipeWords": func(s string) template.HTML { return markupPipeWords(allDecls, s) },
- "markupFirstWord": markupFirstWord,
- "markupRFC": markupRFC,
- "newlinesToBR": newlinesToBR,
+ "firstSentence": firstSentence,
+ "markupPipeWords": func(s string) template.HTML { return markupPipeWords(allDecls, s, true /* linkDecls */) },
+ "markupPipeWordsNoLink": func(s string) template.HTML { return markupPipeWords(allDecls, s, false /* linkDecls */) },
+ "markupFirstWord": markupFirstWord,
+ "markupRFC": markupRFC,
+ "newlinesToBR": newlinesToBR,
})
headerTmpl, err := headerTmpl.Parse(`<!DOCTYPE html>
<html>
@@ -623,12 +622,12 @@
<a href="headers.html">All headers</a>
</div>
- {{range .Preamble}}<p>{{. | markupPipeWords}}</p>{{end}}
+ {{range .Preamble}}<p>{{. | markupPipeWords | markupRFC}}</p>{{end}}
<ol>
{{range .Sections}}
{{if not .IsPrivate}}
- {{if .Anchor}}<li class="header"><a href="#{{.Anchor}}">{{.Preamble | firstSentence | markupPipeWords}}</a></li>{{end}}
+ {{if .Anchor}}<li class="header"><a href="#{{.Anchor}}">{{.Preamble | firstSentence | markupPipeWordsNoLink}}</a></li>{{end}}
{{range .Decls}}
{{if .Anchor}}<li><a href="#{{.Anchor}}"><tt>{{.Name}}</tt></a></li>{{end}}
{{end}}
@@ -641,7 +640,7 @@
<div class="section" {{if .Anchor}}id="{{.Anchor}}"{{end}}>
{{if .Preamble}}
<div class="sectionpreamble">
- {{range .Preamble}}<p>{{. | markupPipeWords}}</p>{{end}}
+ {{range .Preamble}}<p>{{. | markupPipeWords | markupRFC}}</p>{{end}}
</div>
{{end}}
diff --git a/src/util/fipstools/CMakeLists.txt b/src/util/fipstools/CMakeLists.txt
new file mode 100644
index 0000000..6359383
--- /dev/null
+++ b/src/util/fipstools/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(../../include)
+
+if(FIPS)
+ add_executable(
+ test_fips
+
+ test_fips.c
+ )
+
+ add_dependencies(test_fips global_target)
+ target_link_libraries(test_fips crypto)
+endif()
diff --git a/src/util/fipstools/acvp/acvptool/acvp/acvp.go b/src/util/fipstools/acvp/acvptool/acvp/acvp.go
index 04f0932..9419508 100644
--- a/src/util/fipstools/acvp/acvptool/acvp/acvp.go
+++ b/src/util/fipstools/acvp/acvptool/acvp/acvp.go
@@ -33,6 +33,8 @@
"time"
)
+const loginEndpoint = "acvp/v1/login"
+
// Server represents an ACVP server.
type Server struct {
// PrefixTokens are access tokens that apply to URLs under a certain prefix.
@@ -239,7 +241,7 @@
if json.Unmarshal(jsonBytes, &token) != nil {
return false
}
- return token.Expiry > 0 && token.Expiry < uint64(time.Now().Unix())
+ return token.Expiry > 0 && token.Expiry < uint64(time.Now().Add(-10*time.Second).Unix())
}
func (server *Server) getToken(endPoint string) (string, error) {
@@ -255,7 +257,7 @@
var reply struct {
AccessToken string `json:"accessToken"`
}
- if err := server.postMessage(&reply, "acvp/v1/login", map[string]string{
+ if err := server.postMessage(&reply, loginEndpoint, map[string]string{
"password": server.totpFunc(),
"accessToken": token,
}); err != nil {
@@ -278,7 +280,7 @@
SizeLimit int64 `json:"sizeConstraint"`
}
- if err := server.postMessage(&reply, "acvp/v1/login", map[string]string{"password": server.totpFunc()}); err != nil {
+ if err := server.postMessage(&reply, loginEndpoint, map[string]string{"password": server.totpFunc()}); err != nil {
return err
}
@@ -372,7 +374,7 @@
if err != nil {
return nil, err
}
- if len(token) != 0 {
+ if len(token) != 0 && endpoint != loginEndpoint {
req.Header.Add("Authorization", "Bearer "+token)
}
return req, nil
diff --git a/src/util/fipstools/break-kat.go b/src/util/fipstools/break-kat.go
new file mode 100644
index 0000000..b500545
--- /dev/null
+++ b/src/util/fipstools/break-kat.go
@@ -0,0 +1,89 @@
+// break-kat corrupts a known-answer-test input in a binary and writes the
+// corrupted binary to stdout. This is used to demonstrate that the KATs in the
+// binary notice the error.
+package main
+
+import (
+ "bytes"
+ "encoding/hex"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "os"
+ "sort"
+)
+
+var (
+ kats = map[string]string{
+ "HMAC-SHA-256": "dad91293dfcf2a7c8ecd13fe353fa75b",
+ "AES-CBC-encrypt": "078609a6c5ac2544699adf682fa377f9be8ab6aef563e8c56a36b84f557fadd3",
+ "AES-CBC-decrypt": "347aa5a024b28257b36510be583d4f47adb7bbeedc6005bbbd0d0a9f06bb7b10",
+ "AES-GCM-encrypt": "8fcc4099808e75caaff582898848a88d808b55ab4e9370797d940be8cc1d7884",
+ "AES-GCM-decrypt": "35f3058f875760ff09d3120f70c4bc9ed7a86872e13452202176f7371ae04faae1dd391920f5d13953d896785994823c",
+ "DRBG": "c4da0740d505f1ee280b95e58c4931ac6de846a0152fbb4a3f174cf4787a4f1a40c2b50babe14aae530be5886d910a27",
+ "DRBG-reseed": "c7161ca36c2309b716e9859bb96c6d49bdc8352103a18cd24ef42ec97ef46bf446eb1a4576c186e9351803763a7912fe",
+ "SHA-1": "132fd9bad5c1826263bafbb699f707a5",
+ "SHA-256": "ff3b857da7236a2baa0f396b51522217",
+ "SHA-512": "212512f8d2ad8322781c6c4d69a9daa1",
+ "TLS-KDF": "abc3657b094c7628a0b282996fe75a75f4984fd94d4ecc2fcf53a2c469a3f731",
+ "RSA-sign": "d2b56e53306f720d7929d8708bf46f1c22300305582b115bedcac722d8aa5ab2",
+ "RSA-verify": "abe2cbc13d6bd39d48db5334ddbf8d070a93bdcb104e2cc5d0ee486ee295f6b31bda126c41890b98b73e70e6b65d82f95c663121755a90744c8d1c21148a1960be0eca446e9ff497f1345c537ef8119b9a4398e95c5c6de2b1c955905c5299d8ce7a3b6ab76380d9babdd15f610237e1f3f2aa1c1f1e770b62fbb596381b2ebdd77ecef9c90d4c92f7b6b05fed2936285fa94826e62055322a33b6f04c74ce69e5d8d737fb838b79d2d48e3daf71387531882531a95ac964d02ea413bf85952982bbc089527daff5b845c9a0f4d14ef1956d9c3acae882d12da66da0f35794f5ee32232333517db9315232a183b991654dbea41615345c885325926744a53915",
+ "ECDSA-sign": "1e35930be860d0942ca7bbd6f6ded87f157e4de24f81ed4b875c0e018e89a81f",
+ "ECDSA-verify": "6780c5fc70275e2c7061a0e7877bb174deadeb9887027f3fa83654158ba7f50c2d36e5799790bfbe2183d33e96f3c51f6a232f2a24488c8e5f64c37ea2cf0529",
+ "Z-computation": "e7604491269afb5b102d6ea52cb59feb70aede6ce3bfb3e0105485abd861d77b",
+ "FFDH": "a14f8ad36be37b18b8f35864392f150ab7ee22c47e1870052a3f17918274af18aaeaf4cf6aacfde96c9d586eb7ebaff6b03fe3b79a8e2ff9dd6df34caaf2ac70fd3771d026b41a561ee90e4337d0575f8a0bd160c868e7e3cef88aa1d88448b1e4742ba11480a9f8a8b737347c408d74a7d57598c48875629df0c85327a124ddec1ad50cd597a985588434ce19c6f044a1696b5f244b899b7e77d4f6f20213ae8eb15d37eb8e67e6c8bdbc4fd6e17426283da96f23a897b210058c7c70fb126a5bf606dbeb1a6d5cca04184c4e95c2e8a70f50f5c1eabd066bd79c180456316ac02d366eb3b0e7ba82fb70dcbd737ca55734579dd250fffa8e0584be99d32b35",
+ }
+
+ listTests = flag.Bool("list-tests", false, "List known test values and exit")
+)
+
+func main() {
+ flag.Parse()
+
+ if *listTests {
+ for _, kat := range sortedKATs() {
+ fmt.Println(kat)
+ }
+ os.Exit(0)
+ }
+
+ if flag.NArg() != 2 || kats[flag.Arg(1)] == "" {
+ fmt.Fprintln(os.Stderr, "Usage: break-kat <binary path> <test to break> > output")
+ fmt.Fprintln(os.Stderr, "Possible values for <test to break>:")
+ for _, kat := range sortedKATs() {
+ fmt.Fprintln(os.Stderr, " ", kat)
+ }
+ os.Exit(1)
+ }
+
+ inPath := flag.Arg(0)
+ test := flag.Arg(1)
+ testInputValue, err := hex.DecodeString(kats[test])
+ if err != nil {
+ panic("invalid kat data: " + err.Error())
+ }
+
+ binaryContents, err := ioutil.ReadFile(inPath)
+ if err != nil {
+ fmt.Fprintln(os.Stderr, err)
+ os.Exit(2)
+ }
+
+ i := bytes.Index(binaryContents, testInputValue)
+ if i < 0 {
+ fmt.Fprintln(os.Stderr, "Expected test input value was not found in binary.")
+ os.Exit(3)
+ }
+
+ binaryContents[i] ^= 1
+ os.Stdout.Write(binaryContents)
+}
+
+func sortedKATs() []string {
+ var ret []string
+ for kat := range kats {
+ ret = append(ret, kat)
+ }
+ sort.Strings(ret)
+ return ret
+}
diff --git a/src/util/fipstools/break-tests-android.sh b/src/util/fipstools/break-tests-android.sh
deleted file mode 100644
index efb166e..0000000
--- a/src/util/fipstools/break-tests-android.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright (c) 2019, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-# This script exists to exercise breaking each of the FIPS tests on an Android
-# device. Since, on Android, BoringCrypto exists in both 32- and 64-bit
-# versions, the first argument must be either "32" or "64" to select which is
-# being tested. The Android source tree must have been setup (with "lunch") for
-# a matching build configuration before using this script to build the
-# binaries. (Although it'll fail non-silently if there's a mismatch.)
-#
-# Since each test needs the FIPS module to be compiled differently, and that
-# can take a long time, this script is run twice: once with "build" as the
-# second argument to run the builds, and then with "run" as the second argument
-# to run each test.
-#
-# Run it with /bin/bash, not /bin/sh, otherwise "read" may fail.
-#
-# In order to reconfigure the build for each test, it needs to set a define. It
-# does so by rewriting a template in external/boringssl/Android.bp and you must
-# add the template value before doing the builds. To do so, insert
-# -DBORINGSSL_FIPS_BREAK_XXX=1 in the cflags list for the module, probably by
-# putting it in the "boringssl_flags" stanza.
-
-set -x
-set -e
-
-if [ ! -f external/boringssl/Android.bp ]; then
- echo "Must be run from the top-level of an Android source tree."
- exit 1
-fi
-
-. build/envsetup.sh
-
-TESTS="NONE ECDSA_PWCT CRNG RSA_PWCT AES_CBC AES_GCM DES SHA_1 SHA_256 SHA_512 RSA_SIG DRBG ECDSA_SIG Z_COMPUTATION TLS_KDF FFC_DH"
-
-if [ "x$1" = "x32" ]; then
- lib="lib"
- bits="32"
-elif [ "x$1" = "x64" ] ; then
- lib="lib64"
- bits="64"
-else
- echo "First argument must be 32 or 64"
- exit 1
-fi
-
-if [ "x$2" = "xbuild" ]; then
- if ! grep -q DBORINGSSL_FIPS_BREAK_XXX=1 external/boringssl/Android.bp; then
- echo "Missing DBORINGSSL_FIPS_BREAK_XXX in external/boringssl/Android.bp. Edit the file and insert -DBORINGSSL_FIPS_BREAK_XXX=1 in the cflags for the FIPS module"
- exit 1
- fi
-
- printf "\\x1b[1mBuilding modules\\x1b[0m\n"
- for test in $TESTS; do
- printf "\\x1b[1mBuilding for ${test}\\x1b[0m\n"
- cp external/boringssl/Android.bp external/boringssl/Android.bp.orig
- sed -i -e "s/DBORINGSSL_FIPS_BREAK_XXX/DBORINGSSL_FIPS_BREAK_${test}/" external/boringssl/Android.bp
- m test_fips
- dir=test-${bits}-${test}
- rm -Rf $dir
- mkdir $dir
- cp ${ANDROID_PRODUCT_OUT}/system/${lib}/libcrypto.so $dir
- cp ${ANDROID_PRODUCT_OUT}/system/bin/test_fips $dir
- if [ $bits = "32" ] ; then
- if ! file ${dir}/test_fips | grep -q "32-bit" ; then
- echo "32-bit build requested but binaries don't appear to be 32-bit:"
- file ${dir}/test_fips
- exit 1
- fi
- else
- if ! file ${dir}/test_fips | grep -q "64-bit" ; then
- echo "64-bit build requested but binaries don't appear to be 64-bit:"
- file ${dir}/test_fips
- exit 1
- fi
- fi
- cp external/boringssl/Android.bp.orig external/boringssl/Android.bp
- done
-elif [ "x$2" = "xrun" ]; then
- printf "\\x1b[1mTesting\\x1b[0m\n"
- for test in $TESTS; do
- dir=test-${bits}-${test}
- if [ ! '(' -d ${dir} -a -f ${dir}/test_fips -a -f ${dir}/libcrypto.so ')' ] ; then
- echo "Build directory ${dir} is missing or is missing files"
- exit 1
- fi
- adb push ${dir}/* /data/local/tmp
- printf "\\x1b[1mTesting ${test}\\x1b[0m\n"
- adb shell -n -t -x LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/test_fips
- read
- done
-
- printf "\\x1b[1mTesting integrity}\\x1b[0m\n"
- src=test-${bits}-NONE
- dir=test-${bits}-INT
- rm -Rf $dir
- mkdir $dir
- go run external/boringssl/src/util/fipstools/break-hash.go ${src}/libcrypto.so ${dir}/libcrypto.so
- cp ${src}/test_fips $dir
- adb push ${dir}/* /data/local/tmp
- adb shell -n -t -x LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/test_fips
- read
-else
- echo "Second argument must be build or run"
- exit 1
-fi
diff --git a/src/util/fipstools/break-tests.sh b/src/util/fipstools/break-tests.sh
deleted file mode 100644
index 84c24ee..0000000
--- a/src/util/fipstools/break-tests.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2018, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-# This script exists to exercise breaking each of the FIPS tests. It builds
-# BoringSSL differently for each test and that can take a long time. Thus it's
-# run twice: once, from a BoringSSL source tree, with "build" as the sole
-# argument to run the builds, and then (from the same location) with no
-# arguments to run each script.
-#
-# Run it with /bin/bash, not /bin/sh, otherwise "read" may fail.
-
-set -x
-
-TESTS="NONE ECDSA_PWCT CRNG RSA_PWCT AES_CBC AES_GCM DES SHA_1 SHA_256 SHA_512 RSA_SIG DRBG ECDSA_SIG Z_COMPUTATION TLS_KDF FFC_DH"
-
-if [ "x$1" = "xbuild" ]; then
- for test in $TESTS; do
- rm -Rf build-$test
- mkdir build-$test
- pushd build-$test
- cmake -GNinja -DCMAKE_TOOLCHAIN_FILE=${HOME}/toolchain -DFIPS=1 -DFIPS_BREAK_TEST=${test} -DCMAKE_BUILD_TYPE=Release ..
- ninja test_fips
- popd
- done
-
- exit 0
-fi
-
-for test in $TESTS; do
- pushd build-$test
- printf "\n\n\\x1b[1m$test\\x1b[0m\n"
- ./util/fipstools/cavp/test_fips
- echo "Waiting for keypress..."
- read
- popd
-done
-
-pushd build-NONE
-printf "\\x1b[1mIntegrity\\x1b[0m\n"
-go run ../util/fipstools/break-hash.go ./util/fipstools/cavp/test_fips ./util/fipstools/cavp/test_fips_broken
-./util/fipstools/cavp/test_fips_broken
-popd
diff --git a/src/util/fipstools/cavp/CMakeLists.txt b/src/util/fipstools/cavp/CMakeLists.txt
deleted file mode 100644
index a50c9ab..0000000
--- a/src/util/fipstools/cavp/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-include_directories(../../../include)
-
-if(FIPS)
- add_executable(
- cavp
-
- cavp_main.cc
-
- cavp_aes_gcm_test.cc
- cavp_aes_test.cc
- cavp_ctr_drbg_test.cc
- cavp_ecdsa2_keypair_test.cc
- cavp_ecdsa2_pkv_test.cc
- cavp_ecdsa2_siggen_test.cc
- cavp_ecdsa2_sigver_test.cc
- cavp_hmac_test.cc
- cavp_kas_test.cc
- cavp_keywrap_test.cc
- cavp_rsa2_keygen_test.cc
- cavp_rsa2_siggen_test.cc
- cavp_rsa2_sigver_test.cc
- cavp_sha_monte_test.cc
- cavp_sha_test.cc
- cavp_tdes_test.cc
- cavp_tlskdf_test.cc
-
- cavp_test_util.cc
- )
-
- add_dependencies(cavp global_target)
-
- add_executable(
- test_fips
-
- test_fips.c
- )
-
- add_dependencies(test_fips global_target)
-
- target_link_libraries(cavp test_support_lib crypto)
- target_link_libraries(test_fips test_support_lib crypto)
-endif()
diff --git a/src/util/fipstools/cavp/cavp_aes_gcm_test.cc b/src/util/fipstools/cavp/cavp_aes_gcm_test.cc
deleted file mode 100644
index 6ee991d..0000000
--- a/src/util/fipstools/cavp/cavp_aes_gcm_test.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_aes_gcm_test processes a NIST CAVP AES GCM test vector request file and
-// emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/aead.h>
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- const EVP_AEAD *aead;
-};
-
-}
-
-static const EVP_AEAD *GetAEAD(const std::string &name, const bool enc) {
- if (name == "aes-128-gcm") {
- return EVP_aead_aes_128_gcm();
- } else if (name == "aes-192-gcm") {
- return EVP_aead_aes_192_gcm();
- } else if (name == "aes-256-gcm") {
- return EVP_aead_aes_256_gcm();
- }
- return nullptr;
-}
-
-static bool TestAEADEncrypt(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- std::string key_len_str, iv_len_str, pt_len_str, aad_len_str, tag_len_str;
- if (!t->GetInstruction(&key_len_str, "Keylen") ||
- !t->GetInstruction(&iv_len_str, "IVlen") ||
- !t->GetInstruction(&pt_len_str, "PTlen") ||
- !t->GetInstruction(&aad_len_str, "AADlen") ||
- !t->GetInstruction(&tag_len_str, "Taglen")) {
- return false;
- }
-
- std::string count;
- std::vector<uint8_t> key, iv, pt, aad, tag, ct;
- if (!t->GetAttribute(&count, "Count") ||
- !t->GetBytes(&key, "Key") ||
- !t->GetBytes(&iv, "IV") ||
- !t->GetBytes(&pt, "PT") ||
- !t->GetBytes(&aad, "AAD") ||
- key.size() * 8 != strtoul(key_len_str.c_str(), nullptr, 0) ||
- iv.size() * 8 != strtoul(iv_len_str.c_str(), nullptr, 0) ||
- pt.size() * 8 != strtoul(pt_len_str.c_str(), nullptr, 0) ||
- aad.size() * 8 != strtoul(aad_len_str.c_str(), nullptr, 0) ||
- iv.size() != 12) {
- return false;
- }
-
- const size_t tag_len = strtoul(tag_len_str.c_str(), nullptr, 0) / 8;
- if (!AEADEncrypt(ctx->aead, &ct, &tag, tag_len, key, pt, aad, iv)) {
- return false;
- }
- printf("%s", t->CurrentTestToString().c_str());
- printf("CT = %s\r\n", EncodeHex(ct).c_str());
- printf("Tag = %s\r\n\r\n", EncodeHex(tag).c_str());
-
- return true;
-}
-
-static bool TestAEADDecrypt(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- std::string key_len, iv_len, pt_len_str, aad_len_str, tag_len;
- if (!t->GetInstruction(&key_len, "Keylen") ||
- !t->GetInstruction(&iv_len, "IVlen") ||
- !t->GetInstruction(&pt_len_str, "PTlen") ||
- !t->GetInstruction(&aad_len_str, "AADlen") ||
- !t->GetInstruction(&tag_len, "Taglen")) {
- t->PrintLine("Invalid instruction block.");
- return false;
- }
- size_t aad_len = strtoul(aad_len_str.c_str(), nullptr, 0) / 8;
- size_t pt_len = strtoul(pt_len_str.c_str(), nullptr, 0) / 8;
-
- std::string count;
- std::vector<uint8_t> key, iv, ct, aad, tag, pt;
- if (!t->GetAttribute(&count, "Count") ||
- !t->GetBytes(&key, "Key") ||
- !t->GetBytes(&aad, "AAD") ||
- !t->GetBytes(&tag, "Tag") ||
- !t->GetBytes(&iv, "IV") ||
- !t->GetBytes(&ct, "CT") ||
- key.size() * 8 != strtoul(key_len.c_str(), nullptr, 0) ||
- iv.size() * 8 != strtoul(iv_len.c_str(), nullptr, 0) ||
- ct.size() != pt_len ||
- aad.size() != aad_len ||
- tag.size() * 8 != strtoul(tag_len.c_str(), nullptr, 0)) {
- t->PrintLine("Invalid test case");
- return false;
- }
-
- printf("%s", t->CurrentTestToString().c_str());
- bool aead_result =
- AEADDecrypt(ctx->aead, &pt, pt_len, key, aad, ct, tag, iv);
- if (aead_result) {
- printf("PT = %s\r\n\r\n", EncodeHex(pt).c_str());
- } else {
- printf("FAIL\r\n\r\n");
- }
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s (enc|dec) <cipher> <test file>\n", arg);
- return 1;
-}
-
-int cavp_aes_gcm_test_main(int argc, char **argv) {
- if (argc != 4) {
- return usage(argv[0]);
- }
-
- const std::string mode(argv[1]);
- bool (*test_fn)(FileTest * t, void *arg);
- if (mode == "enc") {
- test_fn = &TestAEADEncrypt;
- } else if (mode == "dec") {
- test_fn = &TestAEADDecrypt;
- } else {
- return usage(argv[0]);
- }
-
- const EVP_AEAD *aead = GetAEAD(argv[2], mode == "enc");
- if (aead == nullptr) {
- fprintf(stderr, "invalid aead: %s\n", argv[2]);
- return 1;
- }
-
- TestCtx ctx = {aead};
-
- FileTest::Options opts;
- opts.path = argv[3];
- opts.callback = test_fn;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_aes_test.cc b/src/util/fipstools/cavp/cavp_aes_test.cc
deleted file mode 100644
index d1f49b4..0000000
--- a/src/util/fipstools/cavp/cavp_aes_test.cc
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_aes_test processes a NIST CAVP AES test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- const EVP_CIPHER *cipher;
- bool has_iv;
- enum Mode {
- kKAT, // Known Answer Test
- kMCT, // Monte Carlo Test
- };
- Mode mode;
-};
-
-}
-
-static bool MonteCarlo(const TestCtx *ctx, FileTest *t,
- const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
- bool encrypt, std::vector<uint8_t> key,
- std::vector<uint8_t> iv, std::vector<uint8_t> in) {
- const std::string in_label = encrypt ? "PLAINTEXT" : "CIPHERTEXT",
- result_label = encrypt ? "CIPHERTEXT" : "PLAINTEXT";
- std::vector<uint8_t> prev_result, result, prev_in;
- for (int i = 0; i < 100; i++) {
- printf("COUNT = %d\r\nKEY = %s\r\n", i, EncodeHex(key).c_str());
- if (ctx->has_iv) {
- printf("IV = %s\r\n", EncodeHex(iv).c_str());
- }
- printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
-
- if (!ctx->has_iv) { // ECB mode
- for (int j = 0; j < 1000; j++) {
- prev_result = result;
- if (!CipherOperation(cipher, &result, encrypt, key, iv, in)) {
- return false;
- }
- in = result;
- }
- } else {
- for (int j = 0; j < 1000; j++) {
- prev_result = result;
- if (j > 0) {
- if (encrypt) {
- iv = result;
- } else {
- iv = prev_in;
- }
- }
-
- if (!CipherOperation(cipher, &result, encrypt, key, iv, in)) {
- return false;
- }
-
- prev_in = in;
-
- if (j == 0) {
- in = iv;
- } else {
- in = prev_result;
- }
- }
- }
-
- printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
- const size_t key_len = key.size() * 8;
- if (key_len == 128) {
- for (size_t k = 0; k < key.size(); k++) {
- key[k] ^= result[k];
- }
- } else if (key_len == 192) {
- for (size_t k = 0; k < key.size(); k++) {
- // Key[i+1] = Key[i] xor (last 64-bits of CT[j-1] || CT[j])
- if (k < 8) {
- key[k] ^= prev_result[prev_result.size() - 8 + k];
- } else {
- key[k] ^= result[k - 8];
- }
- }
- } else { // key_len == 256
- for (size_t k = 0; k < key.size(); k++) {
- // Key[i+1] = Key[i] xor (CT[j-1] || CT[j])
- if (k < 16) {
- key[k] ^= prev_result[k];
- } else {
- key[k] ^= result[k - 16];
- }
- }
- }
-
- if (ctx->has_iv) {
- iv = result;
- in = prev_result;
- } else {
- in = result;
- }
- }
-
- return true;
-}
-
-static bool TestCipher(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
- t->PrintLine("Want either ENCRYPT or DECRYPT");
- return false;
- }
- enum {
- kEncrypt,
- kDecrypt,
- } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
- std::string count;
- std::vector<uint8_t> key, iv, in, result;
- if (!t->GetAttribute(&count, "COUNT") ||
- !t->GetBytes(&key, "KEY") ||
- (ctx->has_iv && !t->GetBytes(&iv, "IV"))) {
- return false;
- }
-
- const EVP_CIPHER *cipher = ctx->cipher;
- if (operation == kEncrypt) {
- if (!t->GetBytes(&in, "PLAINTEXT")) {
- return false;
- }
- } else { // operation == kDecrypt
- if (!t->GetBytes(&in, "CIPHERTEXT")) {
- return false;
- }
- }
-
- if (ctx->mode == TestCtx::kKAT) {
- if (!CipherOperation(cipher, &result, operation == kEncrypt, key, iv, in)) {
- return false;
- }
- const std::string label =
- operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
- printf("%s%s = %s\r\n\r\n", t->CurrentTestToString().c_str(), label.c_str(),
- EncodeHex(result).c_str());
- } else { // ctx->mode == kMCT
- const std::string op_label =
- operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
- printf("%s\r\n\r\n", op_label.c_str());
- if (!MonteCarlo(ctx, t, cipher, &result, operation == kEncrypt, key, iv,
- in)) {
- return false;
- }
- if (operation == kEncrypt) {
- // MCT tests contain a stray blank line after the ENCRYPT section.
- printf("\r\n");
- }
- }
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s (kat|mct) <cipher> <test file>\n", arg);
- return 1;
-}
-
-int cavp_aes_test_main(int argc, char **argv) {
- if (argc != 4) {
- return usage(argv[0]);
- }
-
- const std::string tm(argv[1]);
- enum TestCtx::Mode test_mode;
- if (tm == "kat") {
- test_mode = TestCtx::kKAT;
- } else if (tm == "mct") {
- test_mode = TestCtx::kMCT;
- } else {
- fprintf(stderr, "invalid test_mode: %s\n", tm.c_str());
- return usage(argv[0]);
- }
-
- const std::string cipher_name(argv[2]);
- const EVP_CIPHER *cipher = GetCipher(argv[2]);
- if (cipher == nullptr) {
- fprintf(stderr, "invalid cipher: %s\n", argv[2]);
- return 1;
- }
- const bool has_iv =
- (cipher_name != "aes-128-ecb" &&
- cipher_name != "aes-192-ecb" &&
- cipher_name != "aes-256-ecb");
-
- TestCtx ctx = {cipher, has_iv, test_mode};
-
- FileTest::Options opts;
- opts.path = argv[3];
- opts.callback = TestCipher;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc b/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc
deleted file mode 100644
index a27736e..0000000
--- a/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ctr_drbg_test processes a NIST CAVP DRBG800-90A test vector request
-// file and emits the corresponding response.
-
-#include <openssl/crypto.h>
-
-#include <stdlib.h>
-
-#include "cavp_test_util.h"
-#include "../crypto/fipsmodule/rand/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-
-
-static bool TestCTRDRBG(FileTest *t, void *arg) {
- std::string test_type, prediction_resistance, entropy_input_len, nonce_len,
- personalization_str_len, additional_input_len, returned_bits_len;
- if (!t->GetInstruction(&test_type, "AES-256 no df") ||
- !t->GetInstruction(&prediction_resistance, "PredictionResistance") ||
- !t->GetInstruction(&entropy_input_len, "EntropyInputLen") ||
- !t->GetInstruction(&nonce_len, "NonceLen") ||
- !t->GetInstruction(&personalization_str_len,
- "PersonalizationStringLen") ||
- !t->GetInstruction(&additional_input_len, "AdditionalInputLen") ||
- !t->GetInstruction(&returned_bits_len, "ReturnedBitsLen") ||
- !test_type.empty() ||
- prediction_resistance != "False" ||
- strtoul(entropy_input_len.c_str(), nullptr, 0) !=
- CTR_DRBG_ENTROPY_LEN * 8 ||
- nonce_len != "0") {
- return false;
- }
-
- std::string count;
- std::vector<uint8_t> entropy, nonce, personalization_str, ai1, ai2;
- if (!t->GetAttribute(&count, "COUNT") ||
- !t->GetBytes(&entropy, "EntropyInput") ||
- !t->GetBytes(&nonce, "Nonce") ||
- !t->GetBytes(&personalization_str, "PersonalizationString") ||
- !t->GetBytes(&ai1, "AdditionalInput") ||
- !t->GetBytes(&ai2, "AdditionalInput/2") ||
- entropy.size() * 8 != strtoul(entropy_input_len.c_str(), nullptr, 0) ||
- nonce.size() != 0 ||
- personalization_str.size() * 8 !=
- strtoul(personalization_str_len.c_str(), nullptr, 0) ||
- ai1.size() != ai2.size() ||
- ai1.size() * 8 != strtoul(additional_input_len.c_str(), nullptr, 0)) {
- return false;
- }
-
- CTR_DRBG_STATE drbg;
- CTR_DRBG_init(&drbg, entropy.data(),
- personalization_str.size() > 0 ? personalization_str.data()
- : nullptr,
- personalization_str.size());
-
- uint64_t out_len = strtoul(returned_bits_len.c_str(), nullptr, 0);
- if (out_len == 0 || (out_len & 7) != 0) {
- return false;
- }
- out_len /= 8;
-
- std::vector<uint8_t> out;
- out.resize(out_len);
-
- CTR_DRBG_generate(&drbg, out.data(), out.size(),
- ai1.size() > 0 ? ai1.data() : nullptr, ai1.size());
- CTR_DRBG_generate(&drbg, out.data(), out.size(),
- ai2.size() > 0 ? ai2.data() : nullptr, ai2.size());
-
- printf("%s", t->CurrentTestToString().c_str());
- printf("ReturnedBits = %s\r\n\r\n", EncodeHex(out).c_str());
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s <test file>\n", arg);
- return 1;
-}
-
-int cavp_ctr_drbg_test_main(int argc, char **argv) {
- if (argc != 2) {
- return usage(argv[0]);
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestCTRDRBG;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc
deleted file mode 100644
index f8c4a01..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_keypair_test processes a NIST CAVP ECDSA2 KeyPair test vector
-// request file and emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2KeyPair(FileTest *t, void *arg) {
- std::string n_str;
- const char *group_str;
- int nid = GetECGroupNIDFromInstruction(t, &group_str);
- if (nid == NID_undef ||
- !t->GetAttribute(&n_str, "N")) {
- return false;
- }
-
- // Don't use CurrentTestToString to avoid printing the N.
- printf(
- "[%s]\r\n\r\n[B.4.2 Key Pair Generation by Testing Candidates]\r\n\r\n",
- group_str);
-
- unsigned long n = strtoul(n_str.c_str(), nullptr, 10);
- for (unsigned long i = 0; i < n; i++) {
- bssl::UniquePtr<BIGNUM> qx(BN_new()), qy(BN_new());
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- if (!key ||
- !EC_KEY_generate_key_fips(key.get()) ||
- !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
- EC_KEY_get0_public_key(key.get()),
- qx.get(), qy.get(), nullptr)) {
- return false;
- }
-
- size_t degree_len =
- (EC_GROUP_get_degree(EC_KEY_get0_group(key.get())) + 7) / 8;
- size_t order_len =
- BN_num_bytes(EC_GROUP_get0_order(EC_KEY_get0_group(key.get())));
- std::vector<uint8_t> qx_bytes(degree_len), qy_bytes(degree_len);
- std::vector<uint8_t> d_bytes(order_len);
- if (!BN_bn2bin_padded(qx_bytes.data(), qx_bytes.size(), qx.get()) ||
- !BN_bn2bin_padded(qy_bytes.data(), qy_bytes.size(), qy.get()) ||
- !BN_bn2bin_padded(d_bytes.data(), d_bytes.size(),
- EC_KEY_get0_private_key(key.get()))) {
- return false;
- }
-
- printf("d = %s\r\nQx = %s\r\nQy = %s\r\n\r\n", EncodeHex(d_bytes).c_str(),
- EncodeHex(qx_bytes).c_str(), EncodeHex(qy_bytes).c_str());
- }
-
- return true;
-}
-
-int cavp_ecdsa2_keypair_test_main(int argc, char **argv) {
- if (argc != 2) {
- fprintf(stderr, "usage: %s <test file>\n",
- argv[0]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestECDSA2KeyPair;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc
deleted file mode 100644
index d823e7a..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_pkv_test processes a NIST CAVP ECDSA2 PKV test vector request file
-// and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2PKV(FileTest *t, void *arg) {
- int nid = GetECGroupNIDFromInstruction(t);
- if (nid == NID_undef) {
- return false;
- }
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- bssl::UniquePtr<BIGNUM> qx = GetBIGNUM(t, "Qx");
- bssl::UniquePtr<BIGNUM> qy = GetBIGNUM(t, "Qy");
- if (!key || !qx || !qy) {
- return false;
- }
-
- if (EC_KEY_set_public_key_affine_coordinates(key.get(), qx.get(), qy.get())) {
- printf("%sResult = P\r\n\r\n", t->CurrentTestToString().c_str());
- } else {
- char buf[256];
- ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
- printf("%sResult = F (%s)\r\n\r\n", t->CurrentTestToString().c_str(), buf);
- }
- ERR_clear_error();
- return true;
-}
-
-int cavp_ecdsa2_pkv_test_main(int argc, char **argv) {
- if (argc != 2) {
- fprintf(stderr, "usage: %s <test file>\n",
- argv[0]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestECDSA2PKV;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc
deleted file mode 100644
index 1282eaa..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_siggen_test processes NIST CAVP ECDSA2 SigGen and
-// SigGenComponent test vector request files and emits the corresponding
-// response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ec_key.h>
-#include <openssl/ecdsa.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2SigGenImpl(FileTest *t, bool is_component) {
- int nid = GetECGroupNIDFromInstruction(t);
- const EVP_MD *md = GetDigestFromInstruction(t);
- if (nid == NID_undef || md == nullptr) {
- return false;
- }
- bssl::UniquePtr<BIGNUM> qx(BN_new()), qy(BN_new());
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- std::vector<uint8_t> msg;
- if (!qx || !qy || !key ||
- !EC_KEY_generate_key_fips(key.get()) ||
- !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
- EC_KEY_get0_public_key(key.get()),
- qx.get(), qy.get(), nullptr) ||
- !t->GetBytes(&msg, "Msg")) {
- return false;
- }
-
- uint8_t digest[EVP_MAX_MD_SIZE];
- unsigned digest_len;
- if (is_component) {
- if (msg.size() != EVP_MD_size(md)) {
- t->PrintLine("Bad input length.");
- return false;
- }
- digest_len = EVP_MD_size(md);
- OPENSSL_memcpy(digest, msg.data(), msg.size());
- } else if (!EVP_Digest(msg.data(), msg.size(), digest, &digest_len, md,
- nullptr)) {
- return false;
- }
-
- bssl::UniquePtr<ECDSA_SIG> sig(ECDSA_do_sign(digest, digest_len, key.get()));
- if (!sig) {
- return false;
- }
-
- size_t degree_len =
- (EC_GROUP_get_degree(EC_KEY_get0_group(key.get())) + 7) / 8;
- size_t order_len =
- BN_num_bytes(EC_GROUP_get0_order(EC_KEY_get0_group(key.get())));
- std::vector<uint8_t> qx_bytes(degree_len), qy_bytes(degree_len);
- std::vector<uint8_t> r_bytes(order_len), s_bytes(order_len);
- if (!BN_bn2bin_padded(qx_bytes.data(), qx_bytes.size(), qx.get()) ||
- !BN_bn2bin_padded(qy_bytes.data(), qy_bytes.size(), qy.get()) ||
- !BN_bn2bin_padded(r_bytes.data(), r_bytes.size(), sig->r) ||
- !BN_bn2bin_padded(s_bytes.data(), s_bytes.size(), sig->s)) {
- return false;
- }
-
- printf("%sQx = %s\r\nQy = %s\r\nR = %s\r\nS = %s\r\n\r\n",
- t->CurrentTestToString().c_str(), EncodeHex(qx_bytes).c_str(),
- EncodeHex(qy_bytes).c_str(), EncodeHex(r_bytes).c_str(),
- EncodeHex(s_bytes).c_str());
- return true;
-}
-
-static bool TestECDSA2SigGen(FileTest *t, void *arg) {
- return TestECDSA2SigGenImpl(t, false);
-}
-
-static bool TestECDSA2SigGenComponent(FileTest *t, void *arg) {
- return TestECDSA2SigGenImpl(t, true);
-}
-
-int cavp_ecdsa2_siggen_test_main(int argc, char **argv) {
- if (argc != 3) {
- fprintf(stderr, "usage: %s (SigGen|SigGenComponent) <test file>\n",
- argv[0]);
- return 1;
- }
-
- static bool (*test_func)(FileTest *, void *);
- if (strcmp(argv[1], "SigGen") == 0) {
- test_func = TestECDSA2SigGen;
- } else if (strcmp(argv[1], "SigGenComponent") == 0) {
- test_func = TestECDSA2SigGenComponent;
- } else {
- fprintf(stderr, "Unknown test type: %s\n", argv[1]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.callback = test_func;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc
deleted file mode 100644
index f3fd4b1..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_sigver_test processes a NIST CAVP ECDSA2 SigVer test vector
-// request file and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ec_key.h>
-#include <openssl/ecdsa.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2SigVer(FileTest *t, void *arg) {
- int nid = GetECGroupNIDFromInstruction(t);
- const EVP_MD *md = GetDigestFromInstruction(t);
- if (nid == NID_undef || md == nullptr) {
- return false;
- }
- bssl::UniquePtr<ECDSA_SIG> sig(ECDSA_SIG_new());
- bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
- bssl::UniquePtr<BIGNUM> qx = GetBIGNUM(t, "Qx");
- bssl::UniquePtr<BIGNUM> qy = GetBIGNUM(t, "Qy");
- bssl::UniquePtr<BIGNUM> r = GetBIGNUM(t, "R");
- bssl::UniquePtr<BIGNUM> s = GetBIGNUM(t, "S");
- std::vector<uint8_t> msg;
- uint8_t digest[EVP_MAX_MD_SIZE];
- unsigned digest_len;
- if (!sig || !key || !qx || !qy || !r || !s ||
- !EC_KEY_set_public_key_affine_coordinates(key.get(), qx.get(),
- qy.get()) ||
- !t->GetBytes(&msg, "Msg") ||
- !EVP_Digest(msg.data(), msg.size(), digest, &digest_len, md, nullptr)) {
- return false;
- }
-
- BN_free(sig->r);
- sig->r = r.release();
- BN_free(sig->s);
- sig->s = s.release();
-
- if (ECDSA_do_verify(digest, digest_len, sig.get(), key.get())) {
- printf("%sResult = P\r\n\r\n", t->CurrentTestToString().c_str());
- } else {
- char buf[256];
- ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
- printf("%sResult = F (%s)\r\n\r\n", t->CurrentTestToString().c_str(), buf);
- }
- ERR_clear_error();
- return true;
-}
-
-int cavp_ecdsa2_sigver_test_main(int argc, char **argv) {
- if (argc != 2) {
- fprintf(stderr, "usage: %s <test file>\n",
- argv[0]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestECDSA2SigVer;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_hmac_test.cc b/src/util/fipstools/cavp/cavp_hmac_test.cc
deleted file mode 100644
index c88226a..0000000
--- a/src/util/fipstools/cavp/cavp_hmac_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_hmac_test processes a NIST CAVP HMAC test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/hmac.h>
-#include <openssl/span.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestHMAC(FileTest *t, void *arg) {
- std::string md_len_str;
- if (!t->GetInstruction(&md_len_str, "L")) {
- return false;
- }
- const size_t md_len = strtoul(md_len_str.c_str(), nullptr, 0);
-
- const EVP_MD *md;
- switch (md_len) {
- case 20:
- md = EVP_sha1();
- break;
- case 28:
- md = EVP_sha224();
- break;
- case 32:
- md = EVP_sha256();
- break;
- case 48:
- md = EVP_sha384();
- break;
- case 64:
- md = EVP_sha512();
- break;
- default:
- return false;
- }
-
- std::string count_str, k_len_str, t_len_str;
- std::vector<uint8_t> key, msg;
- if (!t->GetAttribute(&count_str, "Count") ||
- !t->GetAttribute(&k_len_str, "Klen") ||
- !t->GetAttribute(&t_len_str, "Tlen") ||
- !t->GetBytes(&key, "Key") ||
- !t->GetBytes(&msg, "Msg")) {
- return false;
- }
-
- size_t k_len = strtoul(k_len_str.c_str(), nullptr, 0);
- size_t t_len = strtoul(t_len_str.c_str(), nullptr, 0);
- if (key.size() < k_len) {
- return false;
- }
- unsigned out_len;
- uint8_t out[EVP_MAX_MD_SIZE];
- if (HMAC(md, key.data(), k_len, msg.data(), msg.size(), out, &out_len) ==
- NULL) {
- return false;
- }
-
- if (out_len < t_len) {
- return false;
- }
-
- printf("%s", t->CurrentTestToString().c_str());
- printf("Mac = %s\r\n\r\n",
- EncodeHex(bssl::MakeConstSpan(out, t_len)).c_str());
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s <test file>\n", arg);
- return 1;
-}
-
-int cavp_hmac_test_main(int argc, char **argv) {
- if (argc != 2) {
- return usage(argv[0]);
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestHMAC;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_kas_test.cc b/src/util/fipstools/cavp/cavp_kas_test.cc
deleted file mode 100644
index 9a74f1d..0000000
--- a/src/util/fipstools/cavp/cavp_kas_test.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2018, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_kas_test processes NIST CAVP ECC KAS test vector request files and
-// emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ecdh.h>
-#include <openssl/ecdsa.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-#include <openssl/sha.h>
-#include <openssl/span.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestKAS(FileTest *t, void *arg) {
- const bool validate = *reinterpret_cast<bool *>(arg);
-
- int nid = NID_undef;
- size_t digest_len = 0;
-
- if (t->HasInstruction("EB - SHA224")) {
- nid = NID_secp224r1;
- digest_len = SHA224_DIGEST_LENGTH;
- } else if (t->HasInstruction("EC - SHA256")) {
- nid = NID_X9_62_prime256v1;
- digest_len = SHA256_DIGEST_LENGTH;
- } else if (t->HasInstruction("ED - SHA384")) {
- nid = NID_secp384r1;
- digest_len = SHA384_DIGEST_LENGTH;
- } else if (t->HasInstruction("EE - SHA512")) {
- nid = NID_secp521r1;
- digest_len = SHA512_DIGEST_LENGTH;
- } else {
- return false;
- }
-
- if (!t->HasAttribute("COUNT")) {
- return false;
- }
-
- bssl::UniquePtr<BIGNUM> their_x(GetBIGNUM(t, "QeCAVSx"));
- bssl::UniquePtr<BIGNUM> their_y(GetBIGNUM(t, "QeCAVSy"));
- bssl::UniquePtr<EC_KEY> ec_key(EC_KEY_new_by_curve_name(nid));
- bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
- if (!their_x || !their_y || !ec_key || !ctx) {
- return false;
- }
-
- const EC_GROUP *const group = EC_KEY_get0_group(ec_key.get());
- bssl::UniquePtr<EC_POINT> their_point(EC_POINT_new(group));
- if (!their_point ||
- !EC_POINT_set_affine_coordinates_GFp(
- group, their_point.get(), their_x.get(), their_y.get(), ctx.get())) {
- return false;
- }
-
- if (validate) {
- bssl::UniquePtr<BIGNUM> our_k(GetBIGNUM(t, "deIUT"));
- if (!our_k ||
- !EC_KEY_set_private_key(ec_key.get(), our_k.get()) ||
- // These attributes are ignored.
- !t->HasAttribute("QeIUTx") ||
- !t->HasAttribute("QeIUTy")) {
- return false;
- }
- } else if (!EC_KEY_generate_key(ec_key.get())) {
- return false;
- }
-
- uint8_t digest[EVP_MAX_MD_SIZE];
- if (!ECDH_compute_key_fips(digest, digest_len, their_point.get(),
- ec_key.get())) {
- return false;
- }
-
- if (validate) {
- std::vector<uint8_t> expected_shared_bytes;
- if (!t->GetBytes(&expected_shared_bytes, "CAVSHashZZ")) {
- return false;
- }
- const bool ok =
- digest_len == expected_shared_bytes.size() &&
- OPENSSL_memcmp(digest, expected_shared_bytes.data(), digest_len) == 0;
-
- printf("%sIUTHashZZ = %s\r\nResult = %c\r\n\r\n\r\n",
- t->CurrentTestToString().c_str(),
- EncodeHex(bssl::MakeConstSpan(digest, digest_len)).c_str(),
- ok ? 'P' : 'F');
- } else {
- const EC_POINT *pub = EC_KEY_get0_public_key(ec_key.get());
- bssl::UniquePtr<BIGNUM> x(BN_new());
- bssl::UniquePtr<BIGNUM> y(BN_new());
- if (!x || !y ||
- !EC_POINT_get_affine_coordinates_GFp(group, pub, x.get(), y.get(),
- ctx.get())) {
- return false;
- }
- bssl::UniquePtr<char> x_hex(BN_bn2hex(x.get()));
- bssl::UniquePtr<char> y_hex(BN_bn2hex(y.get()));
-
- printf("%sQeIUTx = %s\r\nQeIUTy = %s\r\nHashZZ = %s\r\n",
- t->CurrentTestToString().c_str(), x_hex.get(), y_hex.get(),
- EncodeHex(bssl::MakeConstSpan(digest, digest_len)).c_str());
- }
-
- return true;
-}
-
-int cavp_kas_test_main(int argc, char **argv) {
- if (argc != 3) {
- fprintf(stderr, "usage: %s (validity|function) <test file>\n",
- argv[0]);
- return 1;
- }
-
- bool validity;
- if (strcmp(argv[1], "validity") == 0) {
- validity = true;
- } else if (strcmp(argv[1], "function") == 0) {
- validity = false;
- } else {
- fprintf(stderr, "Unknown test type: %s\n", argv[1]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.arg = &validity;
- opts.callback = TestKAS;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- opts.is_kas_test = true;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_keywrap_test.cc b/src/util/fipstools/cavp/cavp_keywrap_test.cc
deleted file mode 100644
index 67397ec..0000000
--- a/src/util/fipstools/cavp/cavp_keywrap_test.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_keywrap_test processes a NIST CAVP AES test vector request file and
-// emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/aes.h>
-#include <openssl/crypto.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- bool encrypt;
- bool padding;
-};
-
-} // namespace
-
-static bool AESKeyWrap(std::vector<uint8_t> *out, bool encrypt,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &in) {
- size_t key_bits = key.size() * 8;
- if (key_bits != 128 && key_bits != 192 && key_bits != 256) {
- return false;
- }
- AES_KEY aes_key;
-
- if (encrypt) {
- out->resize(in.size() + 8);
- if (AES_set_encrypt_key(key.data(), key_bits, &aes_key) ||
- AES_wrap_key(&aes_key, nullptr, out->data(), in.data(), in.size()) ==
- -1) {
- return false;
- }
- } else {
- out->resize(in.size() - 8);
- if (AES_set_decrypt_key(key.data(), key_bits, &aes_key) ||
- AES_unwrap_key(&aes_key, nullptr, out->data(), in.data(), in.size()) ==
- -1) {
- return false;
- }
- }
-
- return true;
-}
-
-static bool AESKeyWrapWithPadding(std::vector<uint8_t> *out, bool encrypt,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &in) {
- const size_t key_bits = key.size() * 8;
- if (key_bits != 128 && key_bits != 192 && key_bits != 256) {
- return false;
- }
- AES_KEY aes_key;
-
- size_t out_len;
- if (encrypt) {
- out->resize(in.size() + 15);
- if (AES_set_encrypt_key(key.data(), key_bits, &aes_key) ||
- !AES_wrap_key_padded(&aes_key, out->data(), &out_len, out->size(),
- in.data(), in.size())) {
- return false;
- }
- } else {
- out->resize(in.size());
- if (AES_set_decrypt_key(key.data(), key_bits, &aes_key) ||
- !AES_unwrap_key_padded(&aes_key, out->data(), &out_len, out->size(),
- in.data(), in.size())) {
- return false;
- }
- }
-
- out->resize(out_len);
- return true;
-}
-
-static bool TestCipher(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- std::string count, unused, in_label = ctx->encrypt ? "P" : "C",
- result_label = ctx->encrypt ? "C" : "P";
- std::vector<uint8_t> key, in, result;
- // clang-format off
- if (!t->GetInstruction(&unused, "PLAINTEXT LENGTH") ||
- !t->GetAttribute(&count, "COUNT") ||
- !t->GetBytes(&key, "K") ||
- !t->GetBytes(&in, in_label)) {
- return false;
- }
- // clang-format on
-
- auto wrap_function = AESKeyWrap;
- if (ctx->padding) {
- wrap_function = AESKeyWrapWithPadding;
- }
-
- printf("%s", t->CurrentTestToString().c_str());
- if (!wrap_function(&result, ctx->encrypt, key, in)) {
- if (ctx->encrypt) {
- return false;
- } else {
- printf("FAIL\r\n\r\n");
- }
- } else {
- printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
- }
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(
- stderr,
- "usage: %s (enc|dec|enc-pad|dec-pad) (128|192|256) <test file>\n",
- arg);
- return 1;
-}
-
-int cavp_keywrap_test_main(int argc, char **argv) {
- if (argc != 4) {
- return usage(argv[0]);
- }
-
- const std::string op(argv[1]);
- bool encrypt = false;
- bool padding = false;
- if (op == "enc") {
- encrypt = true;
- } else if (op == "dec") {
- } else if (op == "enc-pad") {
- encrypt = true;
- padding = true;
- } else if (op == "dec-pad") {
- padding = true;
- } else {
- return usage(argv[0]);
- }
-
- TestCtx ctx = {encrypt, padding};
-
- FileTest::Options opts;
- opts.path = argv[3];
- opts.callback = TestCipher;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_main.cc b/src/util/fipstools/cavp/cavp_main.cc
deleted file mode 100644
index 64dbd69..0000000
--- a/src/util/fipstools/cavp/cavp_main.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_main is a wrapper that invokes the main entry function of one of the
-// CAVP validation suite binaries.
-
-#include <stdlib.h>
-#include <cstdio>
-#include <string>
-
-#include <openssl/crypto.h>
-
-#include "cavp_test_util.h"
-
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s <validation suite> <args ...>\n", arg);
- return 1;
-}
-
-struct TestSuite {
- std::string name;
- int (*main_func)(int argc, char **argv);
-};
-
-static TestSuite all_test_suites[] = {
- {"aes", &cavp_aes_test_main},
- {"aes_gcm", &cavp_aes_gcm_test_main},
- {"ctr_drbg", &cavp_ctr_drbg_test_main},
- {"ecdsa2_keypair", &cavp_ecdsa2_keypair_test_main},
- {"ecdsa2_pkv", &cavp_ecdsa2_pkv_test_main},
- {"ecdsa2_siggen", &cavp_ecdsa2_siggen_test_main},
- {"ecdsa2_sigver", &cavp_ecdsa2_sigver_test_main},
- {"hmac", &cavp_hmac_test_main},
- {"kas", &cavp_kas_test_main},
- {"keywrap", &cavp_keywrap_test_main},
- {"rsa2_keygen", &cavp_rsa2_keygen_test_main},
- {"rsa2_siggen", &cavp_rsa2_siggen_test_main},
- {"rsa2_sigver", &cavp_rsa2_sigver_test_main},
- {"tlskdf", &cavp_tlskdf_test_main},
- {"sha", &cavp_sha_test_main},
- {"sha_monte", &cavp_sha_monte_test_main},
- {"tdes", &cavp_tdes_test_main}
-};
-
-int main(int argc, char **argv) {
- CRYPTO_library_init();
-
- if (argc < 3) {
- return usage(argv[0]);
- }
-
- const std::string suite(argv[1]);
- for (const TestSuite &s : all_test_suites) {
- if (s.name == suite) {
- return s.main_func(argc - 1, &argv[1]);
- }
- }
-
- fprintf(stderr, "invalid test suite: %s\n\n", argv[1]);
- return usage(argv[0]);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc b/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc
deleted file mode 100644
index e7088c7..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_keygen_test processes NIST CAVP RSA2 KeyGen test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestRSA2KeyGen(FileTest *t, void *arg) {
- std::string mod_str, table, count_str;
- if (!t->GetInstruction(&mod_str, "mod") ||
- !t->GetInstruction(&table, "Table for M-R Test") ||
- table != "C.2" ||
- !t->GetAttribute(&count_str, "N")) {
- return false;
- }
-
- printf("[mod = %s]\r\n", mod_str.c_str());
- printf("[Table for M-R Test = %s]\r\n\r\n", table.c_str());
-
- size_t bits = strtoul(mod_str.c_str(), nullptr, 0);
- size_t count = strtoul(count_str.c_str(), nullptr, 0);
- for (size_t i = 0; i < count; i++) {
- bssl::UniquePtr<RSA> key(RSA_new());
- if (key == nullptr ||
- bits == 0 ||
- !RSA_generate_key_fips(key.get(), bits, nullptr)) {
- return 0;
- }
-
- const BIGNUM *n, *e, *d, *p, *q;
- RSA_get0_key(key.get(), &n, &e, &d);
- RSA_get0_factors(key.get(), &p, &q);
- std::vector<uint8_t> n_bytes(BN_num_bytes(n)), e_bytes(BN_num_bytes(e)),
- d_bytes((bits + 7) / 8), p_bytes(BN_num_bytes(p)),
- q_bytes(BN_num_bytes(q));
- if (n == NULL ||
- BN_bn2bin(n, n_bytes.data()) != n_bytes.size() ||
- e == NULL ||
- BN_bn2bin(e, e_bytes.data()) != e_bytes.size() ||
- d == NULL ||
- !BN_bn2bin_padded(d_bytes.data(), d_bytes.size(), d) ||
- p == NULL ||
- BN_bn2bin(p, p_bytes.data()) != p_bytes.size() ||
- q == NULL ||
- BN_bn2bin(q, q_bytes.data()) != q_bytes.size()) {
- return false;
- }
-
- printf("e = %s\r\np = %s\r\nq = %s\r\nn = %s\r\nd = %s\r\n\r\n",
- EncodeHex(e_bytes).c_str(), EncodeHex(p_bytes).c_str(),
- EncodeHex(q_bytes).c_str(), EncodeHex(n_bytes).c_str(),
- EncodeHex(d_bytes).c_str());
- }
-
- return true;
-}
-
-int cavp_rsa2_keygen_test_main(int argc, char **argv) {
- if (argc != 2) {
- fprintf(stderr, "usage: %s <test file>\n",
- argv[0]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestRSA2KeyGen;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc b/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc
deleted file mode 100644
index 636a73a..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_siggen_test processes NIST CAVP RSA2 SigGen test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-namespace {
-
-struct TestCtx {
- bssl::UniquePtr<RSA> key;
- bool is_pss;
-};
-
-}
-
-static bool TestRSA2SigGen(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- std::string mod_str, hash;
- std::vector<uint8_t> msg;
- if (!t->GetInstruction(&mod_str, "mod") ||
- !t->GetAttribute(&hash, "SHAAlg") ||
- !t->GetBytes(&msg, "Msg")) {
- return false;
- }
-
- std::string test = t->CurrentTestToString();
- if (t->IsAtNewInstructionBlock()) {
- int mod_bits = strtoul(mod_str.c_str(), nullptr, 0);
- ctx->key = bssl::UniquePtr<RSA>(RSA_new());
- if (ctx->key == nullptr ||
- mod_bits == 0 ||
- !RSA_generate_key_fips(ctx->key.get(), mod_bits, nullptr)) {
- return false;
- }
-
- const BIGNUM *n, *e;
- RSA_get0_key(ctx->key.get(), &n, &e, nullptr);
-
- std::vector<uint8_t> n_bytes(BN_num_bytes(n));
- std::vector<uint8_t> e_bytes(BN_num_bytes(e));
- if (!BN_bn2bin_padded(n_bytes.data(), n_bytes.size(), n) ||
- !BN_bn2bin_padded(e_bytes.data(), e_bytes.size(), e)) {
- return false;
- }
-
- printf("[mod = %s]\r\n\r\nn = %s\r\n\r\ne = %s", mod_str.c_str(),
- EncodeHex(n_bytes).c_str(), EncodeHex(e_bytes).c_str());
- test = test.substr(test.find("]") + 3);
- }
-
- const EVP_MD *md = EVP_get_digestbyname(hash.c_str());
- uint8_t digest_buf[EVP_MAX_MD_SIZE];
- std::vector<uint8_t> sig(RSA_size(ctx->key.get()));
- unsigned digest_len;
- size_t sig_len;
- if (md == NULL ||
- !EVP_Digest(msg.data(), msg.size(), digest_buf, &digest_len, md, NULL)) {
- return false;
- }
-
- if (ctx->is_pss) {
- if (!RSA_sign_pss_mgf1(ctx->key.get(), &sig_len, sig.data(), sig.size(),
- digest_buf, digest_len, md, md, -1)) {
- return false;
- }
- } else {
- unsigned sig_len_u;
- if (!RSA_sign(EVP_MD_type(md), digest_buf, digest_len, sig.data(),
- &sig_len_u, ctx->key.get())) {
- return false;
- }
- sig_len = sig_len_u;
- }
-
- sig.resize(sig_len);
- printf("%sS = %s\r\n\r\n", test.c_str(), EncodeHex(sig).c_str());
- return true;
-}
-
-int cavp_rsa2_siggen_test_main(int argc, char **argv) {
- if (argc != 3) {
- fprintf(stderr, "usage: %s (pkcs15|pss) <test file>\n",
- argv[0]);
- return 1;
- }
-
- TestCtx ctx;
- if (strcmp(argv[1], "pkcs15") == 0) {
- ctx = {nullptr, false};
- } else if (strcmp(argv[1], "pss") == 0) {
- ctx = {nullptr, true};
- } else {
- fprintf(stderr, "Unknown test type: %s\n", argv[1]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.callback = TestRSA2SigGen;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc b/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc
deleted file mode 100644
index cbcfc1f..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_sigver_test processes NIST CAVP RSA2 SigVer test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/err.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- std::vector<uint8_t> N;
- bool is_pss;
-};
-
-}
-
-static bool TestRSA2SigVer(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- std::string mod_str;
- if (!t->GetInstruction(&mod_str, "mod")) {
- return false;
- }
-
- printf("%s", t->CurrentTestToString().c_str());
-
- if (t->HasAttribute("n")) {
- printf("\r\n");
- return t->GetBytes(&ctx->N, "n");
- }
-
- std::string hash;
- std::vector<uint8_t> e_bytes, msg, sig;
- if (!t->GetAttribute(&hash, "SHAAlg") ||
- !t->GetBytes(&e_bytes, "e") ||
- !t->GetBytes(&msg, "Msg") ||
- !t->GetBytes(&sig, "S")) {
- return false;
- }
-
- bssl::UniquePtr<RSA> key(RSA_new());
- key->n = BN_new();
- key->e = BN_new();
- if (key == nullptr ||
- !BN_bin2bn(ctx->N.data(), ctx->N.size(), key->n) ||
- !BN_bin2bn(e_bytes.data(), e_bytes.size(), key->e)) {
- return false;
- }
-
- const EVP_MD *md = EVP_get_digestbyname(hash.c_str());
- uint8_t digest_buf[EVP_MAX_MD_SIZE];
- unsigned digest_len;
- if (md == NULL ||
- !EVP_Digest(msg.data(), msg.size(), digest_buf, &digest_len, md, NULL)) {
- return false;
- }
-
- int ok;
- if (ctx->is_pss) {
- ok = RSA_verify_pss_mgf1(key.get(), digest_buf, digest_len, md, md, -1,
- sig.data(), sig.size());
- } else {
- ok = RSA_verify(EVP_MD_type(md), digest_buf, digest_len, sig.data(),
- sig.size(), key.get());
- }
-
- if (ok) {
- printf("Result = P\r\n\r\n");
- } else {
- char buf[256];
- ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
- printf("Result = F (%s)\r\n\r\n", buf);
- }
- ERR_clear_error();
- return true;
-}
-
-int cavp_rsa2_sigver_test_main(int argc, char **argv) {
- if (argc != 3) {
- fprintf(stderr, "usage: %s (pkcs15|pss) <test file>\n",
- argv[0]);
- return 1;
- }
-
- TestCtx ctx;
- if (strcmp(argv[1], "pkcs15") == 0) {
- ctx = {std::vector<uint8_t>(), false};
- } else if (strcmp(argv[1], "pss") == 0) {
- ctx = {std::vector<uint8_t>(), true};
- } else {
- fprintf(stderr, "Unknown test type: %s\n", argv[1]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.callback = TestRSA2SigVer;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_sha_monte_test.cc b/src/util/fipstools/cavp/cavp_sha_monte_test.cc
deleted file mode 100644
index f5bcdd1..0000000
--- a/src/util/fipstools/cavp/cavp_sha_monte_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_sha_monte_test processes a NIST CAVP SHA-Monte test vector request file
-// and emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- std::string hash;
-};
-
-}
-
-static bool TestSHAMonte(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- const EVP_MD *md = EVP_get_digestbyname(ctx->hash.c_str());
- if (md == nullptr) {
- return false;
- }
- const size_t md_len = EVP_MD_size(md);
-
- std::string out_len;
- if (!t->GetInstruction(&out_len, "L") ||
- md_len != strtoul(out_len.c_str(), nullptr, 0)) {
- return false;
- }
-
- std::vector<uint8_t> seed;
- if (!t->GetBytes(&seed, "Seed") ||
- seed.size() != md_len) {
- return false;
- }
-
- std::vector<uint8_t> out = seed;
-
- printf("%s\r\n", t->CurrentTestToString().c_str());
-
- for (int count = 0; count < 100; count++) {
- std::vector<uint8_t> msg;
- msg.insert(msg.end(), out.begin(), out.end());
- msg.insert(msg.end(), out.begin(), out.end());
- msg.insert(msg.end(), out.begin(), out.end());
- for (int i = 0; i < 1000; i++) {
- unsigned digest_len;
- if (!EVP_Digest(msg.data(), msg.size(), out.data(), &digest_len, md,
- nullptr) ||
- digest_len != out.size()) {
- return false;
- }
-
- msg.erase(msg.begin(), msg.begin() + out.size());
- msg.insert(msg.end(), out.begin(), out.end());
- }
- printf("COUNT = %d\r\n", count);
- printf("MD = %s\r\n\r\n", EncodeHex(out).c_str());
- }
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s <hash> <test file>\n", arg);
- return 1;
-}
-
-int cavp_sha_monte_test_main(int argc, char **argv) {
- if (argc != 3) {
- return usage(argv[0]);
- }
-
- TestCtx ctx = {std::string(argv[1])};
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.callback = TestSHAMonte;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_sha_test.cc b/src/util/fipstools/cavp/cavp_sha_test.cc
deleted file mode 100644
index c046451..0000000
--- a/src/util/fipstools/cavp/cavp_sha_test.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_sha_test processes a NIST CAVP SHA test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-namespace {
-
-struct TestCtx {
- std::string hash;
-};
-
-}
-
-static bool TestSHA(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- const EVP_MD *md = EVP_get_digestbyname(ctx->hash.c_str());
- if (md == nullptr) {
- return false;
- }
- const size_t md_len = EVP_MD_size(md);
-
- std::string out_len;
- if (!t->GetInstruction(&out_len, "L") ||
- md_len != strtoul(out_len.c_str(), nullptr, 0)) {
- return false;
- }
-
- std::string msg_len_str;
- std::vector<uint8_t> msg;
- if (!t->GetAttribute(&msg_len_str, "Len") ||
- !t->GetBytes(&msg, "Msg")) {
- return false;
- }
-
- size_t msg_len = strtoul(msg_len_str.c_str(), nullptr, 0);
- if (msg_len % 8 != 0 ||
- msg_len / 8 > msg.size()) {
- return false;
- }
- msg_len /= 8;
-
- std::vector<uint8_t> out;
- out.resize(md_len);
- unsigned digest_len;
- if (!EVP_Digest(msg.data(), msg_len, out.data(), &digest_len, md, nullptr) ||
- digest_len != out.size()) {
- return false;
- }
-
- printf("%s", t->CurrentTestToString().c_str());
- printf("MD = %s\r\n\r\n", EncodeHex(out).c_str());
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s <hash> <test file>\n", arg);
- return 1;
-}
-
-int cavp_sha_test_main(int argc, char **argv) {
- if (argc != 3) {
- return usage(argv[0]);
- }
-
- TestCtx ctx = {std::string(argv[1])};
-
- FileTest::Options opts;
- opts.path = argv[2];
- opts.callback = TestSHA;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_tdes_test.cc b/src/util/fipstools/cavp/cavp_tdes_test.cc
deleted file mode 100644
index 7b8839d..0000000
--- a/src/util/fipstools/cavp/cavp_tdes_test.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_tdes_test processes a NIST TMOVS test vector request file and emits the
-// corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
- const EVP_CIPHER *cipher;
- enum Mode {
- kKAT, // Known Answer Test
- kMCT, // Monte Carlo Test
- };
- bool has_iv;
- Mode mode;
-};
-
-}
-
-static bool TestKAT(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
- t->PrintLine("Want either ENCRYPT or DECRYPT");
- return false;
- }
- enum {
- kEncrypt,
- kDecrypt,
- } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
- if (t->HasAttribute("NumKeys")) {
- // Another file format quirk: NumKeys is a single attribute line immediately
- // following an instruction and should probably have been an instruction
- // instead. If it is present, the file has separate attributes "KEY{1,2,3}".
- // If it is not, the keys are concatenated in a single attribute "KEYs".
- std::string num_keys;
- t->GetAttribute(&num_keys, "NumKeys");
- t->InjectInstruction("NumKeys", num_keys);
-
- std::string header = operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
- printf("%s\r\n\r\n", header.c_str());
-
- return true;
- }
-
- enum {
- kNotPresent,
- kTwo,
- kThree,
- } num_keys = kNotPresent;
- if (t->HasInstruction("NumKeys")) {
- std::string num_keys_str;
- t->GetInstruction(&num_keys_str, "NumKeys");
- const int n = strtoul(num_keys_str.c_str(), nullptr, 0);
- if (n == 2) {
- num_keys = kTwo;
- } else if (n == 3) {
- num_keys = kThree;
- } else {
- t->PrintLine("invalid NumKeys value");
- return false;
- }
- }
-
- std::string count;
- std::vector<uint8_t> keys, key1, key2, key3, iv, in, result;
- const std::string in_label =
- operation == kEncrypt ? "PLAINTEXT" : "CIPHERTEXT";
- // clang-format off
- if (!t->GetAttribute(&count, "COUNT") ||
- (num_keys == 0 && !t->GetBytes(&keys, "KEYs")) ||
- (num_keys > 0 &&
- (!t->GetBytes(&key1, "KEY1") ||
- !t->GetBytes(&key2, "KEY2") ||
- !t->GetBytes(&key3, "KEY3"))) ||
- (ctx->has_iv && !t->GetBytes(&iv, "IV")) ||
- !t->GetBytes(&in, in_label)) {
- return false;
- }
- // clang-format on
- std::vector<uint8_t> key;
- if (num_keys != kNotPresent) {
- key.insert(key.end(), key1.begin(), key1.end());
- key.insert(key.end(), key2.begin(), key2.end());
- if (num_keys == kThree) {
- key.insert(key.end(), key3.begin(), key3.end());
- }
- } else {
- key.insert(key.end(), keys.begin(), keys.end());
- key.insert(key.end(), keys.begin(), keys.end());
- key.insert(key.end(), keys.begin(), keys.end());
- }
-
- if (!CipherOperation(ctx->cipher, &result, operation == kEncrypt, key, iv,
- in)) {
- return false;
- }
-
- // TDES fax files output format differs from file to file, and the input
- // format is inconsistent with the output, so we construct the output manually
- // rather than printing CurrentTestToString().
- if (t->IsAtNewInstructionBlock() && num_keys == kNotPresent) {
- // If NumKeys is present, header is printed when parsing NumKeys.
- std::string header = operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
- printf("%s\r\n", header.c_str());
- }
- const std::string result_label =
- operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
- printf("COUNT = %s\r\n", count.c_str());
- if (num_keys == kNotPresent) {
- printf("KEYs = %s\r\n", EncodeHex(keys).c_str());
- } else {
- printf("KEY1 = %s\r\nKEY2 = %s\r\nKEY3 = %s\r\n", EncodeHex(key1).c_str(),
- EncodeHex(key2).c_str(), EncodeHex(key3).c_str());
- }
- if (ctx->has_iv) {
- printf("IV = %s\r\n", EncodeHex(iv).c_str());
- }
- printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
- printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
- return true;
-}
-
-// XORKeyWithOddParityLSB sets |*key| to |key| XOR |value| and then writes
-// the LSB of each byte to establish odd parity for that byte. This parity-based
-// embedded of a DES key into 64 bits is an old tradition and something that
-// NIST's tests require.
-static void XORKeyWithOddParityLSB(std::vector<uint8_t> *key,
- const std::vector<uint8_t> &value) {
- for (size_t i = 0; i < key->size(); i++) {
- uint8_t v = (*key)[i] ^ value[i];
-
- // Use LSB to establish odd parity.
- v |= 0x01;
- for (uint8_t j = 1; j < 8; j++) {
- v ^= ((v >> j) & 0x01);
- }
- (*key)[i] = v;
- }
-}
-
-static bool TestMCT(FileTest *t, void *arg) {
- TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
- if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
- t->PrintLine("Want either ENCRYPT or DECRYPT");
- return false;
- }
- enum {
- kEncrypt,
- kDecrypt,
- } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
- if (t->HasAttribute("NumKeys")) {
- // Another file format quirk: NumKeys is a single attribute line immediately
- // following an instruction and should probably have been an instruction
- // instead.
- std::string num_keys;
- t->GetAttribute(&num_keys, "NumKeys");
- t->InjectInstruction("NumKeys", num_keys);
- return true;
- }
-
- enum {
- kTwo,
- kThree,
- } num_keys;
- std::string num_keys_str;
- if (!t->GetInstruction(&num_keys_str, "NumKeys")) {
- return false;
- } else {
- const int n = strtoul(num_keys_str.c_str(), nullptr, 0);
- if (n == 2) {
- num_keys = kTwo;
- } else if (n == 3) {
- num_keys = kThree;
- } else {
- t->PrintLine("invalid NumKeys value");
- return false;
- }
- }
-
- std::string count;
- std::vector<uint8_t> key1, key2, key3, iv, in, result;
- const std::string in_label =
- operation == kEncrypt ? "PLAINTEXT" : "CIPHERTEXT";
- // clang-format off
- if (!t->GetBytes(&key1, "KEY1") ||
- !t->GetBytes(&key2, "KEY2") ||
- !t->GetBytes(&key3, "KEY3") ||
- (ctx->has_iv && !t->GetBytes(&iv, "IV")) ||
- !t->GetBytes(&in, in_label)) {
- return false;
- }
- // clang-format on
-
- for (int i = 0; i < 400; i++) {
- std::vector<uint8_t> current_iv = iv, current_in = in, prev_result,
- prev_prev_result;
-
- std::vector<uint8_t> key(key1);
- key.insert(key.end(), key2.begin(), key2.end());
- key.insert(key.end(), key3.begin(), key3.end());
-
- for (int j = 0; j < 10000; j++) {
- prev_prev_result = prev_result;
- prev_result = result;
- const EVP_CIPHER *cipher = ctx->cipher;
- if (!CipherOperation(cipher, &result, operation == kEncrypt, key,
- current_iv, current_in)) {
- t->PrintLine("CipherOperation failed");
- return false;
- }
- if (ctx->has_iv) {
- if (operation == kEncrypt) {
- if (j == 0) {
- current_in = current_iv;
- } else {
- current_in = prev_result;
- }
- current_iv = result;
- } else { // operation == kDecrypt
- current_iv = current_in;
- current_in = result;
- }
- } else {
- current_in = result;
- }
- }
-
- // Output result for COUNT = i.
- const std::string result_label =
- operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
- if (i == 0) {
- const std::string op_label =
- operation == kEncrypt ? "ENCRYPT" : "DECRYPT";
- printf("[%s]\n\n", op_label.c_str());
- }
- printf("COUNT = %d\r\nKEY1 = %s\r\nKEY2 = %s\r\nKEY3 = %s\r\n", i,
- EncodeHex(key1).c_str(), EncodeHex(key2).c_str(),
- EncodeHex(key3).c_str());
- if (ctx->has_iv) {
- printf("IV = %s\r\n", EncodeHex(iv).c_str());
- }
- printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
- printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
-
- XORKeyWithOddParityLSB(&key1, result);
- XORKeyWithOddParityLSB(&key2, prev_result);
- if (num_keys == kThree) {
- XORKeyWithOddParityLSB(&key3, prev_prev_result);
- } else {
- XORKeyWithOddParityLSB(&key3, result);
- }
-
- if (ctx->has_iv) {
- if (operation == kEncrypt) {
- in = prev_result;
- iv = result;
- } else {
- iv = current_iv;
- in = current_in;
- }
- } else {
- in = result;
- }
- }
-
- return true;
-}
-
-static int usage(char *arg) {
- fprintf(stderr, "usage: %s (kat|mct) <cipher> <test file>\n", arg);
- return 1;
-}
-
-int cavp_tdes_test_main(int argc, char **argv) {
- if (argc != 4) {
- return usage(argv[0]);
- }
-
- const std::string tm(argv[1]);
- enum TestCtx::Mode test_mode;
- if (tm == "kat") {
- test_mode = TestCtx::kKAT;
- } else if (tm == "mct") {
- test_mode = TestCtx::kMCT;
- } else {
- fprintf(stderr, "invalid test_mode: %s\n", tm.c_str());
- return usage(argv[0]);
- }
-
- const std::string cipher_name(argv[2]);
- const EVP_CIPHER *cipher = GetCipher(argv[2]);
- if (cipher == nullptr) {
- fprintf(stderr, "invalid cipher: %s\n", argv[2]);
- return 1;
- }
- bool has_iv = cipher_name != "des-ede" && cipher_name != "des-ede3";
- TestCtx ctx = {cipher, has_iv, test_mode};
-
- FileTestFunc test_fn = test_mode == TestCtx::kKAT ? &TestKAT : &TestMCT;
- FileTest::Options opts;
- opts.path = argv[3];
- opts.callback = test_fn;
- opts.arg = &ctx;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_test_util.cc b/src/util/fipstools/cavp/cavp_test_util.cc
deleted file mode 100644
index 1b4e3a1..0000000
--- a/src/util/fipstools/cavp/cavp_test_util.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#include "cavp_test_util.h"
-
-#include <openssl/bn.h>
-#include <openssl/digest.h>
-#include <openssl/ec.h>
-#include <openssl/nid.h>
-
-
-const EVP_CIPHER *GetCipher(const std::string &name) {
- if (name == "des-cbc") {
- return EVP_des_cbc();
- } else if (name == "des-ecb") {
- return EVP_des_ecb();
- } else if (name == "des-ede") {
- return EVP_des_ede();
- } else if (name == "des-ede3") {
- return EVP_des_ede3();
- } else if (name == "des-ede-cbc") {
- return EVP_des_ede_cbc();
- } else if (name == "des-ede3-cbc") {
- return EVP_des_ede3_cbc();
- } else if (name == "rc4") {
- return EVP_rc4();
- } else if (name == "aes-128-ecb") {
- return EVP_aes_128_ecb();
- } else if (name == "aes-256-ecb") {
- return EVP_aes_256_ecb();
- } else if (name == "aes-128-cbc") {
- return EVP_aes_128_cbc();
- } else if (name == "aes-128-gcm") {
- return EVP_aes_128_gcm();
- } else if (name == "aes-128-ofb") {
- return EVP_aes_128_ofb();
- } else if (name == "aes-192-cbc") {
- return EVP_aes_192_cbc();
- } else if (name == "aes-192-ctr") {
- return EVP_aes_192_ctr();
- } else if (name == "aes-192-ecb") {
- return EVP_aes_192_ecb();
- } else if (name == "aes-256-cbc") {
- return EVP_aes_256_cbc();
- } else if (name == "aes-128-ctr") {
- return EVP_aes_128_ctr();
- } else if (name == "aes-256-ctr") {
- return EVP_aes_256_ctr();
- } else if (name == "aes-256-gcm") {
- return EVP_aes_256_gcm();
- } else if (name == "aes-256-ofb") {
- return EVP_aes_256_ofb();
- }
- return nullptr;
-}
-
-bool CipherOperation(const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
- bool encrypt, const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &iv,
- const std::vector<uint8_t> &in) {
- bssl::ScopedEVP_CIPHER_CTX ctx;
- if (!EVP_CipherInit_ex(ctx.get(), cipher, nullptr, nullptr, nullptr,
- encrypt ? 1 : 0)) {
- return false;
- }
- if (!iv.empty() && iv.size() != EVP_CIPHER_CTX_iv_length(ctx.get())) {
- return false;
- }
-
- int result_len1 = 0, result_len2;
- *out = std::vector<uint8_t>(in.size());
- if (!EVP_CIPHER_CTX_set_key_length(ctx.get(), key.size()) ||
- !EVP_CipherInit_ex(ctx.get(), nullptr, nullptr, key.data(), iv.data(),
- -1) ||
- !EVP_CIPHER_CTX_set_padding(ctx.get(), 0) ||
- !EVP_CipherUpdate(ctx.get(), out->data(), &result_len1, in.data(),
- in.size()) ||
- !EVP_CipherFinal_ex(ctx.get(), out->data() + result_len1, &result_len2)) {
- return false;
- }
- out->resize(result_len1 + result_len2);
-
- return true;
-}
-
-bool AEADEncrypt(const EVP_AEAD *aead, std::vector<uint8_t> *ct,
- std::vector<uint8_t> *tag, size_t tag_len,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &pt,
- const std::vector<uint8_t> &aad,
- const std::vector<uint8_t> &iv) {
- bssl::ScopedEVP_AEAD_CTX ctx;
- if (!EVP_AEAD_CTX_init(ctx.get(), aead, key.data(), key.size(), tag_len,
- nullptr)) {
- return false;
- }
-
- std::vector<uint8_t> out;
- out.resize(pt.size() + EVP_AEAD_max_overhead(aead));
- size_t out_len;
- if (!EVP_AEAD_CTX_seal(ctx.get(), out.data(), &out_len, out.size(), iv.data(),
- iv.size(), pt.data(), pt.size(), aad.data(),
- aad.size())) {
- return false;
- }
- out.resize(out_len);
-
- ct->assign(out.begin(), out.end() - tag_len);
- tag->assign(out.end() - tag_len, out.end());
-
- return true;
-}
-
-bool AEADDecrypt(const EVP_AEAD *aead, std::vector<uint8_t> *pt, size_t pt_len,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &aad,
- const std::vector<uint8_t> &ct,
- const std::vector<uint8_t> &tag,
- const std::vector<uint8_t> &iv) {
- bssl::ScopedEVP_AEAD_CTX ctx;
- if (!EVP_AEAD_CTX_init_with_direction(ctx.get(), aead, key.data(), key.size(),
- tag.size(), evp_aead_open)) {
- return false;
- }
- std::vector<uint8_t> in = ct;
- in.reserve(ct.size() + tag.size());
- in.insert(in.end(), tag.begin(), tag.end());
-
- pt->resize(pt_len);
- size_t out_pt_len;
- if (!EVP_AEAD_CTX_open(ctx.get(), pt->data(), &out_pt_len, pt->size(),
- iv.data(), iv.size(), in.data(), in.size(), aad.data(),
- aad.size()) ||
- out_pt_len != pt_len) {
- return false;
- }
- return true;
-}
-
-static int HexToBIGNUM(bssl::UniquePtr<BIGNUM> *out, const char *in) {
- BIGNUM *raw = NULL;
- int ret = BN_hex2bn(&raw, in);
- out->reset(raw);
- return ret;
-}
-
-bssl::UniquePtr<BIGNUM> GetBIGNUM(FileTest *t, const char *attribute) {
- std::string hex;
- if (!t->GetAttribute(&hex, attribute)) {
- return nullptr;
- }
-
- bssl::UniquePtr<BIGNUM> ret;
- if (HexToBIGNUM(&ret, hex.c_str()) != static_cast<int>(hex.size())) {
- t->PrintLine("Could not decode '%s'.", hex.c_str());
- return nullptr;
- }
- return ret;
-}
-
-int GetECGroupNIDFromInstruction(FileTest *t, const char **out_str) {
- const char *dummy;
- if (out_str == nullptr) {
- out_str = &dummy;
- }
-
- if (t->HasInstruction("P-224")) {
- *out_str = "P-224";
- return NID_secp224r1;
- }
- if (t->HasInstruction("P-256")) {
- *out_str = "P-256";
- return NID_X9_62_prime256v1;
- }
- if (t->HasInstruction("P-384")) {
- *out_str = "P-384";
- return NID_secp384r1;
- }
- if (t->HasInstruction("P-521")) {
- *out_str = "P-521";
- return NID_secp521r1;
- }
- t->PrintLine("No supported group specified.");
- return NID_undef;
-}
-
-const EVP_MD *GetDigestFromInstruction(FileTest *t) {
- if (t->HasInstruction("SHA-1")) {
- return EVP_sha1();
- }
- if (t->HasInstruction("SHA-224")) {
- return EVP_sha224();
- }
- if (t->HasInstruction("SHA-256")) {
- return EVP_sha256();
- }
- if (t->HasInstruction("SHA-384")) {
- return EVP_sha384();
- }
- if (t->HasInstruction("SHA-512")) {
- return EVP_sha512();
- }
- t->PrintLine("No supported digest function specified.");
- return nullptr;
-}
-
-void EchoComment(const std::string& comment) {
- fwrite(comment.c_str(), comment.size(), 1, stdout);
-}
diff --git a/src/util/fipstools/cavp/cavp_test_util.h b/src/util/fipstools/cavp/cavp_test_util.h
deleted file mode 100644
index d51dfe6..0000000
--- a/src/util/fipstools/cavp/cavp_test_util.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
-#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
-
-#include <stdlib.h>
-#include <string>
-#include <vector>
-
-#include <openssl/aead.h>
-#include <openssl/cipher.h>
-
-#include "../crypto/test/file_test.h"
-
-
-const EVP_CIPHER *GetCipher(const std::string &name);
-
-bool CipherOperation(const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
- bool encrypt, const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &iv,
- const std::vector<uint8_t> &in);
-
-bool AEADEncrypt(const EVP_AEAD *aead, std::vector<uint8_t> *ct,
- std::vector<uint8_t> *tag, size_t tag_len,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &pt,
- const std::vector<uint8_t> &aad,
- const std::vector<uint8_t> &iv);
-
-bool AEADDecrypt(const EVP_AEAD *aead, std::vector<uint8_t> *pt, size_t pt_len,
- const std::vector<uint8_t> &key,
- const std::vector<uint8_t> &aad,
- const std::vector<uint8_t> &ct,
- const std::vector<uint8_t> &tag,
- const std::vector<uint8_t> &iv);
-
-bssl::UniquePtr<BIGNUM> GetBIGNUM(FileTest *t, const char *attribute);
-
-int GetECGroupNIDFromInstruction(FileTest *t, const char **out_str = nullptr);
-
-const EVP_MD *GetDigestFromInstruction(FileTest *t);
-
-void EchoComment(const std::string& comment);
-
-int cavp_aes_gcm_test_main(int argc, char **argv);
-int cavp_aes_test_main(int argc, char **argv);
-int cavp_ctr_drbg_test_main(int argc, char **argv);
-int cavp_ecdsa2_keypair_test_main(int argc, char **argv);
-int cavp_ecdsa2_pkv_test_main(int argc, char **argv);
-int cavp_ecdsa2_siggen_test_main(int argc, char **argv);
-int cavp_ecdsa2_sigver_test_main(int argc, char **argv);
-int cavp_hmac_test_main(int argc, char **argv);
-int cavp_kas_test_main(int argc, char **argv);
-int cavp_keywrap_test_main(int argc, char **argv);
-int cavp_rsa2_keygen_test_main(int argc, char **argv);
-int cavp_rsa2_siggen_test_main(int argc, char **argv);
-int cavp_rsa2_sigver_test_main(int argc, char **argv);
-int cavp_sha_monte_test_main(int argc, char **argv);
-int cavp_sha_test_main(int argc, char **argv);
-int cavp_tdes_test_main(int argc, char **argv);
-int cavp_tlskdf_test_main(int argc, char **argv);
-
-
-#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
diff --git a/src/util/fipstools/cavp/cavp_tlskdf_test.cc b/src/util/fipstools/cavp/cavp_tlskdf_test.cc
deleted file mode 100644
index 0243439..0000000
--- a/src/util/fipstools/cavp/cavp_tlskdf_test.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2018, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_tlskdf_test processes NIST TLS KDF test vectors and emits the
-// corresponding response.
-// See https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/components/askdfvs.pdf, section 6.4.
-
-#include <vector>
-
-#include <errno.h>
-
-#include <openssl/digest.h>
-
-#include "cavp_test_util.h"
-#include "../crypto/fipsmodule/tls/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-
-
-static bool TestTLSKDF(FileTest *t, void *arg) {
- const EVP_MD *md = nullptr;
-
- if (t->HasInstruction("TLS 1.0/1.1")) {
- md = EVP_md5_sha1();
- } else if (t->HasInstruction("TLS 1.2")) {
- if (t->HasInstruction("SHA-256")) {
- md = EVP_sha256();
- } else if (t->HasInstruction("SHA-384")) {
- md = EVP_sha384();
- } else if (t->HasInstruction("SHA-512")) {
- md = EVP_sha512();
- }
- }
-
- if (md == nullptr) {
- return false;
- }
-
- std::string key_block_len_str;
- std::vector<uint8_t> premaster, server_random, client_random,
- key_block_server_random, key_block_client_random;
- if (!t->GetBytes(&premaster, "pre_master_secret") ||
- !t->GetBytes(&server_random, "serverHello_random") ||
- !t->GetBytes(&client_random, "clientHello_random") ||
- // The NIST tests specify different client and server randoms for the
- // expansion step from the master-secret step. This is impossible in TLS.
- !t->GetBytes(&key_block_server_random, "server_random") ||
- !t->GetBytes(&key_block_client_random, "client_random") ||
- !t->GetInstruction(&key_block_len_str, "key block length") ||
- // These are ignored.
- !t->HasAttribute("COUNT") ||
- !t->HasInstruction("pre-master secret length")) {
- return false;
- }
-
- uint8_t master_secret[48];
- static const char kMasterSecretLabel[] = "master secret";
- if (!CRYPTO_tls1_prf(md, master_secret, sizeof(master_secret),
- premaster.data(), premaster.size(), kMasterSecretLabel,
- sizeof(kMasterSecretLabel) - 1, client_random.data(),
- client_random.size(), server_random.data(),
- server_random.size())) {
- return false;
- }
-
- errno = 0;
- const long int key_block_bits =
- strtol(key_block_len_str.c_str(), nullptr, 10);
- if (errno != 0 || key_block_bits <= 0 || (key_block_bits & 7) != 0) {
- return false;
- }
- const size_t key_block_len = key_block_bits / 8;
- std::vector<uint8_t> key_block(key_block_len);
- static const char kLabel[] = "key expansion";
- if (!CRYPTO_tls1_prf(
- md, key_block.data(), key_block.size(), master_secret,
- sizeof(master_secret), kLabel, sizeof(kLabel) - 1,
- key_block_server_random.data(), key_block_server_random.size(),
- key_block_client_random.data(), key_block_client_random.size())) {
- return false;
- }
-
- printf("%smaster_secret = %s\r\nkey_block = %s\r\n\r\n",
- t->CurrentTestToString().c_str(), EncodeHex(master_secret).c_str(),
- EncodeHex(key_block).c_str());
-
- return true;
-}
-
-int cavp_tlskdf_test_main(int argc, char **argv) {
- if (argc != 2) {
- fprintf(stderr, "usage: %s <test file>\n", argv[0]);
- return 1;
- }
-
- FileTest::Options opts;
- opts.path = argv[1];
- opts.callback = TestTLSKDF;
- opts.silent = true;
- opts.comment_callback = EchoComment;
- return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/run_cavp.go b/src/util/fipstools/cavp/run_cavp.go
deleted file mode 100644
index 51a4100..0000000
--- a/src/util/fipstools/cavp/run_cavp.go
+++ /dev/null
@@ -1,592 +0,0 @@
-// run_cavp.go processes CAVP input files and generates suitable response
-// files, optionally comparing the results against the provided FAX files.
-package main
-
-import (
- "bufio"
- "errors"
- "flag"
- "fmt"
- "os"
- "os/exec"
- "path"
- "path/filepath"
- "runtime"
- "strings"
- "sync"
- "time"
-)
-
-var (
- oraclePath = flag.String("oracle-bin", "", "Path to the oracle binary")
- suiteDir = flag.String("suite-dir", "", "Base directory containing the CAVP test suite")
- noFAX = flag.Bool("no-fax", false, "Skip comparing against FAX files")
- android = flag.Bool("android", false, "Run tests via ADB")
-)
-
-const (
- androidTmpPath = "/data/local/tmp/"
- androidCAVPPath = androidTmpPath + "cavp"
- androidLibCryptoPath = androidTmpPath + "libcrypto.so"
-)
-
-// test describes a single request file.
-type test struct {
- // inFile is the base of the filename without an extension, i.e.
- // “ECBMCT128”.
- inFile string
- // args are the arguments (not including the input filename) to the
- // oracle binary.
- args []string
- // noFAX, if true, indicates that the output cannot be compared against
- // the FAX file. (E.g. because the primitive is non-deterministic.)
- noFAX bool
-}
-
-// nextLineState can be used by FAX next-line function to store state.
-type nextLineState struct {
- // State used by the KAS test.
- nextIsIUTHash bool
-}
-
-// testSuite describes a series of tests that are handled by a single oracle
-// binary.
-type testSuite struct {
- // directory is the name of the directory in the CAVP input, i.e. “AES”.
- directory string
- // suite names the test suite to pass as the first command-line argument.
- suite string
- // nextLineFunc, if not nil, is the function used to read the next line
- // from the FAX file. This can be used to skip lines and/or mutate them
- // as needed. The second argument can be used by the scanner to store
- // state, if needed. If isWildcard is true on return then line is not
- // meaningful and any line from the response file should be accepted.
- nextLineFunc func(*bufio.Scanner, *nextLineState) (line string, isWildcard, ok bool)
- tests []test
-}
-
-func (t *testSuite) getDirectory() string {
- return filepath.Join(*suiteDir, t.directory)
-}
-
-var aesGCMTests = testSuite{
- "AES_GCM",
- "aes_gcm",
- nil,
- []test{
- {"gcmDecrypt128", []string{"dec", "aes-128-gcm"}, false},
- {"gcmDecrypt192", []string{"dec", "aes-192-gcm"}, false},
- {"gcmDecrypt256", []string{"dec", "aes-256-gcm"}, false},
- {"gcmEncryptExtIV128", []string{"enc", "aes-128-gcm"}, false},
- {"gcmEncryptExtIV192", []string{"enc", "aes-192-gcm"}, false},
- {"gcmEncryptExtIV256", []string{"enc", "aes-256-gcm"}, false},
- },
-}
-
-var aesTests = testSuite{
- "AES",
- "aes",
- nil,
- []test{
- {"CBCGFSbox128", []string{"kat", "aes-128-cbc"}, false},
- {"CBCGFSbox192", []string{"kat", "aes-192-cbc"}, false},
- {"CBCGFSbox256", []string{"kat", "aes-256-cbc"}, false},
- {"CBCKeySbox128", []string{"kat", "aes-128-cbc"}, false},
- {"CBCKeySbox192", []string{"kat", "aes-192-cbc"}, false},
- {"CBCKeySbox256", []string{"kat", "aes-256-cbc"}, false},
- {"CBCMMT128", []string{"kat", "aes-128-cbc"}, false},
- {"CBCMMT192", []string{"kat", "aes-192-cbc"}, false},
- {"CBCMMT256", []string{"kat", "aes-256-cbc"}, false},
- {"CBCVarKey128", []string{"kat", "aes-128-cbc"}, false},
- {"CBCVarKey192", []string{"kat", "aes-192-cbc"}, false},
- {"CBCVarKey256", []string{"kat", "aes-256-cbc"}, false},
- {"CBCVarTxt128", []string{"kat", "aes-128-cbc"}, false},
- {"CBCVarTxt192", []string{"kat", "aes-192-cbc"}, false},
- {"CBCVarTxt256", []string{"kat", "aes-256-cbc"}, false},
- {"ECBGFSbox128", []string{"kat", "aes-128-ecb"}, false},
- {"ECBGFSbox192", []string{"kat", "aes-192-ecb"}, false},
- {"ECBGFSbox256", []string{"kat", "aes-256-ecb"}, false},
- {"ECBKeySbox128", []string{"kat", "aes-128-ecb"}, false},
- {"ECBKeySbox192", []string{"kat", "aes-192-ecb"}, false},
- {"ECBKeySbox256", []string{"kat", "aes-256-ecb"}, false},
- {"ECBMMT128", []string{"kat", "aes-128-ecb"}, false},
- {"ECBMMT192", []string{"kat", "aes-192-ecb"}, false},
- {"ECBMMT256", []string{"kat", "aes-256-ecb"}, false},
- {"ECBVarKey128", []string{"kat", "aes-128-ecb"}, false},
- {"ECBVarKey192", []string{"kat", "aes-192-ecb"}, false},
- {"ECBVarKey256", []string{"kat", "aes-256-ecb"}, false},
- {"ECBVarTxt128", []string{"kat", "aes-128-ecb"}, false},
- {"ECBVarTxt192", []string{"kat", "aes-192-ecb"}, false},
- {"ECBVarTxt256", []string{"kat", "aes-256-ecb"}, false},
- // AES Monte-Carlo tests
- {"ECBMCT128", []string{"mct", "aes-128-ecb"}, false},
- {"ECBMCT192", []string{"mct", "aes-192-ecb"}, false},
- {"ECBMCT256", []string{"mct", "aes-256-ecb"}, false},
- {"CBCMCT128", []string{"mct", "aes-128-cbc"}, false},
- {"CBCMCT192", []string{"mct", "aes-192-cbc"}, false},
- {"CBCMCT256", []string{"mct", "aes-256-cbc"}, false},
- },
-}
-
-var ecdsa2KeyPairTests = testSuite{
- "ECDSA2",
- "ecdsa2_keypair",
- nil,
- []test{{"KeyPair", nil, true}},
-}
-
-var ecdsa2PKVTests = testSuite{
- "ECDSA2",
- "ecdsa2_pkv",
- nil,
- []test{{"PKV", nil, false}},
-}
-
-var ecdsa2SigGenTests = testSuite{
- "ECDSA2",
- "ecdsa2_siggen",
- nil,
- []test{
- {"SigGen", []string{"SigGen"}, true},
- {"SigGenComponent", []string{"SigGenComponent"}, true},
- },
-}
-
-var ecdsa2SigVerTests = testSuite{
- "ECDSA2",
- "ecdsa2_sigver",
- nil,
- []test{{"SigVer", nil, false}},
-}
-
-var rsa2KeyGenTests = testSuite{
- "RSA2",
- "rsa2_keygen",
- nil,
- []test{
- {"KeyGen_RandomProbablyPrime3_3", nil, true},
- },
-}
-
-var rsa2SigGenTests = testSuite{
- "RSA2",
- "rsa2_siggen",
- nil,
- []test{
- {"SigGen15_186-3", []string{"pkcs15"}, true},
- {"SigGenPSS_186-3", []string{"pss"}, true},
- },
-}
-
-var rsa2SigVerTests = testSuite{
- "RSA2",
- "rsa2_sigver",
- func(s *bufio.Scanner, state *nextLineState) (string, bool, bool) {
- for {
- if !s.Scan() {
- return "", false, false
- }
-
- line := s.Text()
- if strings.HasPrefix(line, "p = ") || strings.HasPrefix(line, "d = ") || strings.HasPrefix(line, "SaltVal = ") || strings.HasPrefix(line, "EM with ") {
- continue
- }
- if strings.HasPrefix(line, "q = ") {
- // Skip the "q = " line and an additional blank line.
- if !s.Scan() ||
- len(strings.TrimSpace(s.Text())) > 0 {
- return "", false, false
- }
- continue
- }
- return line, false, true
- }
- },
- []test{
- {"SigVer15_186-3", []string{"pkcs15"}, false},
- {"SigVerPSS_186-3", []string{"pss"}, false},
- },
-}
-
-var hmacTests = testSuite{
- "HMAC",
- "hmac",
- nil,
- []test{{"HMAC", nil, false}},
-}
-
-var shaTests = testSuite{
- "SHA",
- "sha",
- nil,
- []test{
- {"SHA1LongMsg", []string{"SHA1"}, false},
- {"SHA1ShortMsg", []string{"SHA1"}, false},
- {"SHA224LongMsg", []string{"SHA224"}, false},
- {"SHA224ShortMsg", []string{"SHA224"}, false},
- {"SHA256LongMsg", []string{"SHA256"}, false},
- {"SHA256ShortMsg", []string{"SHA256"}, false},
- {"SHA384LongMsg", []string{"SHA384"}, false},
- {"SHA384ShortMsg", []string{"SHA384"}, false},
- {"SHA512LongMsg", []string{"SHA512"}, false},
- {"SHA512ShortMsg", []string{"SHA512"}, false},
- },
-}
-
-var shaMonteTests = testSuite{
- "SHA",
- "sha_monte",
- nil,
- []test{
- {"SHA1Monte", []string{"SHA1"}, false},
- {"SHA224Monte", []string{"SHA224"}, false},
- {"SHA256Monte", []string{"SHA256"}, false},
- {"SHA384Monte", []string{"SHA384"}, false},
- {"SHA512Monte", []string{"SHA512"}, false},
- },
-}
-
-var ctrDRBGTests = testSuite{
- "DRBG800-90A",
- "ctr_drbg",
- nil,
- []test{{"CTR_DRBG", nil, false}},
-}
-
-var tdesTests = testSuite{
- "TDES",
- "tdes",
- nil,
- []test{
- {"TCBCMMT2", []string{"kat", "des-ede-cbc"}, false},
- {"TCBCMMT3", []string{"kat", "des-ede3-cbc"}, false},
- {"TCBCMonte2", []string{"mct", "des-ede3-cbc"}, false},
- {"TCBCMonte3", []string{"mct", "des-ede3-cbc"}, false},
- {"TCBCinvperm", []string{"kat", "des-ede3-cbc"}, false},
- {"TCBCpermop", []string{"kat", "des-ede3-cbc"}, false},
- {"TCBCsubtab", []string{"kat", "des-ede3-cbc"}, false},
- {"TCBCvarkey", []string{"kat", "des-ede3-cbc"}, false},
- {"TCBCvartext", []string{"kat", "des-ede3-cbc"}, false},
- {"TECBMMT2", []string{"kat", "des-ede"}, false},
- {"TECBMMT3", []string{"kat", "des-ede3"}, false},
- {"TECBMonte2", []string{"mct", "des-ede3"}, false},
- {"TECBMonte3", []string{"mct", "des-ede3"}, false},
- {"TECBinvperm", []string{"kat", "des-ede3"}, false},
- {"TECBpermop", []string{"kat", "des-ede3"}, false},
- {"TECBsubtab", []string{"kat", "des-ede3"}, false},
- {"TECBvarkey", []string{"kat", "des-ede3"}, false},
- {"TECBvartext", []string{"kat", "des-ede3"}, false},
- },
-}
-
-var keyWrapTests = testSuite{
- "KeyWrap38F",
- "keywrap",
- nil,
- []test{
- {"KW_AD_128", []string{"dec", "128"}, false},
- {"KW_AD_192", []string{"dec", "192"}, false},
- {"KW_AD_256", []string{"dec", "256"}, false},
- {"KW_AE_128", []string{"enc", "128"}, false},
- {"KW_AE_192", []string{"enc", "192"}, false},
- {"KW_AE_256", []string{"enc", "256"}, false},
- {"KWP_AD_128", []string{"dec-pad", "128"}, false},
- {"KWP_AD_192", []string{"dec-pad", "192"}, false},
- {"KWP_AD_256", []string{"dec-pad", "256"}, false},
- {"KWP_AE_128", []string{"enc-pad", "128"}, false},
- {"KWP_AE_192", []string{"enc-pad", "192"}, false},
- {"KWP_AE_256", []string{"enc-pad", "256"}, false},
- },
-}
-
-var kasTests = testSuite{
- "KAS",
- "kas",
- func(s *bufio.Scanner, state *nextLineState) (line string, isWildcard, ok bool) {
- for {
- // If the response file will include the IUT hash next,
- // return a wildcard signal because this cannot be
- // matched against the FAX file.
- if state.nextIsIUTHash {
- state.nextIsIUTHash = false
- return "", true, true
- }
-
- if !s.Scan() {
- return "", false, false
- }
-
- line := s.Text()
- if strings.HasPrefix(line, "deCAVS = ") || strings.HasPrefix(line, "Z = ") {
- continue
- }
- if strings.HasPrefix(line, "CAVSHashZZ = ") {
- state.nextIsIUTHash = true
- }
- return line, false, true
- }
- },
- []test{
- {"KASFunctionTest_ECCEphemeralUnified_NOKC_ZZOnly_init", []string{"function"}, true},
- {"KASFunctionTest_ECCEphemeralUnified_NOKC_ZZOnly_resp", []string{"function"}, true},
- {"KASValidityTest_ECCEphemeralUnified_NOKC_ZZOnly_init", []string{"validity"}, false},
- {"KASValidityTest_ECCEphemeralUnified_NOKC_ZZOnly_resp", []string{"validity"}, false},
- },
-}
-
-var tlsKDFTests = testSuite{
- "KDF135",
- "tlskdf",
- nil,
- []test{
- {"tls", nil, false},
- },
-}
-
-var testSuites = []*testSuite{
- &aesGCMTests,
- &aesTests,
- &ctrDRBGTests,
- &ecdsa2KeyPairTests,
- &ecdsa2PKVTests,
- &ecdsa2SigGenTests,
- &ecdsa2SigVerTests,
- &hmacTests,
- &keyWrapTests,
- &rsa2KeyGenTests,
- &rsa2SigGenTests,
- &rsa2SigVerTests,
- &shaTests,
- &shaMonteTests,
- &tdesTests,
- &kasTests,
- &tlsKDFTests,
-}
-
-// testInstance represents a specific test in a testSuite.
-type testInstance struct {
- suite *testSuite
- testIndex int
-}
-
-func worker(wg *sync.WaitGroup, work <-chan testInstance) {
- defer wg.Done()
-
- for ti := range work {
- test := ti.suite.tests[ti.testIndex]
-
- if err := doTest(ti.suite, test); err != nil {
- fmt.Fprintf(os.Stderr, "%s\n", err)
- os.Exit(2)
- }
-
- if !*noFAX && !test.noFAX {
- if err := compareFAX(ti.suite, test); err != nil {
- fmt.Fprintf(os.Stderr, "%s\n", err)
- os.Exit(3)
- }
- }
- }
-}
-
-func checkAndroidPrereqs() error {
- // The cavp binary, and a matching libcrypto.so, are required to be placed
- // in /data/local/tmp before running this script.
- if err := exec.Command("adb", "shell", "ls", androidCAVPPath).Run(); err != nil {
- return errors.New("failed to list cavp binary; ensure that adb works and cavp binary is in place: " + err.Error())
- }
- if err := exec.Command("adb", "shell", "ls", androidLibCryptoPath).Run(); err != nil {
- return errors.New("failed to list libcrypto.so; ensure that library is in place: " + err.Error())
- }
- return nil
-}
-
-func main() {
- flag.Parse()
-
- if *android {
- if err := checkAndroidPrereqs(); err != nil {
- fmt.Fprintf(os.Stderr, "%s\n", err)
- os.Exit(1)
- }
- } else if len(*oraclePath) == 0 {
- fmt.Fprintf(os.Stderr, "Must give -oracle-bin\n")
- os.Exit(1)
- }
-
- work := make(chan testInstance)
- var wg sync.WaitGroup
-
- numWorkers := runtime.NumCPU()
- if *android {
- numWorkers = 1
- }
-
- for i := 0; i < numWorkers; i++ {
- wg.Add(1)
- go worker(&wg, work)
- }
-
- for _, suite := range testSuites {
- for i := range suite.tests {
- work <- testInstance{suite, i}
- }
- }
-
- close(work)
- wg.Wait()
-}
-
-func doTest(suite *testSuite, test test) error {
- bin := *oraclePath
- var args []string
-
- if *android {
- bin = "adb"
- args = []string{"shell", "LD_LIBRARY_PATH=" + androidTmpPath, androidCAVPPath}
- }
-
- args = append(args, suite.suite)
- args = append(args, test.args...)
- reqPath := filepath.Join(suite.getDirectory(), "req", test.inFile+".req")
- var reqPathOnDevice string
-
- if *android {
- reqPathOnDevice = path.Join(androidTmpPath, test.inFile+".req")
- if err := exec.Command("adb", "push", reqPath, reqPathOnDevice).Run(); err != nil {
- return errors.New("failed to push request file: " + err.Error())
- }
- args = append(args, reqPathOnDevice)
- } else {
- args = append(args, reqPath)
- }
-
- respDir := filepath.Join(suite.getDirectory(), "resp")
- if err := os.Mkdir(respDir, 0755); err != nil && !os.IsExist(err) {
- return fmt.Errorf("cannot create resp directory: %s", err)
- }
- outPath := filepath.Join(respDir, test.inFile+".rsp")
- outFile, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
- if err != nil {
- return fmt.Errorf("cannot open output file for %q %q: %s", suite.getDirectory(), test.inFile, err)
- }
- defer outFile.Close()
-
- cmd := exec.Command(bin, args...)
- cmd.Stdout = outFile
- cmd.Stderr = os.Stderr
-
- cmdLine := strings.Join(append([]string{bin}, args...), " ")
- startTime := time.Now()
- if err := cmd.Run(); err != nil {
- return fmt.Errorf("cannot run command for %q %q (%s): %s", suite.getDirectory(), test.inFile, cmdLine, err)
- }
-
- fmt.Printf("%s (%ds)\n", cmdLine, int(time.Since(startTime).Seconds()))
-
- if *android {
- exec.Command("adb", "shell", "rm", reqPathOnDevice).Run()
- }
-
- return nil
-}
-
-func canonicalizeLine(in string) string {
- if strings.HasPrefix(in, "Result = P (") {
- return "Result = P"
- }
- if strings.HasPrefix(in, "Result = F (") {
- return "Result = F"
- }
- return in
-}
-
-func compareFAX(suite *testSuite, test test) error {
- nextLineFunc := suite.nextLineFunc
- if nextLineFunc == nil {
- nextLineFunc = func(s *bufio.Scanner, state *nextLineState) (string, bool, bool) {
- if !s.Scan() {
- return "", false, false
- }
- return s.Text(), false, true
- }
- }
-
- respPath := filepath.Join(suite.getDirectory(), "resp", test.inFile+".rsp")
- respFile, err := os.Open(respPath)
- if err != nil {
- return fmt.Errorf("cannot read output of %q %q: %s", suite.getDirectory(), test.inFile, err)
- }
- defer respFile.Close()
-
- faxPath := filepath.Join(suite.getDirectory(), "fax", test.inFile+".fax")
- faxFile, err := os.Open(faxPath)
- if err != nil {
- return fmt.Errorf("cannot open fax file for %q %q: %s", suite.getDirectory(), test.inFile, err)
- }
- defer faxFile.Close()
-
- respScanner := bufio.NewScanner(respFile)
- faxScanner := bufio.NewScanner(faxFile)
- var nextLineState nextLineState
-
- lineNo := 0
- inHeader := true
-
- for respScanner.Scan() {
- lineNo++
- respLine := respScanner.Text()
- var faxLine string
- var isWildcard, ok bool
-
- if inHeader && (len(respLine) == 0 || respLine[0] == '#') {
- continue
- }
-
- for {
- haveFaxLine := false
-
- if inHeader {
- for {
- if faxLine, isWildcard, ok = nextLineFunc(faxScanner, &nextLineState); !ok {
- break
- }
- if len(faxLine) != 0 && faxLine[0] != '#' {
- haveFaxLine = true
- break
- }
- }
-
- inHeader = false
- } else {
- faxLine, isWildcard, haveFaxLine = nextLineFunc(faxScanner, &nextLineState)
- }
-
- if !haveFaxLine {
- // Ignore blank lines at the end of the generated file.
- if len(respLine) == 0 {
- break
- }
- return fmt.Errorf("resp file is longer than fax for %q %q", suite.getDirectory(), test.inFile)
- }
-
- if strings.HasPrefix(faxLine, " (Reason: ") {
- continue
- }
-
- break
- }
-
- if isWildcard || canonicalizeLine(faxLine) == canonicalizeLine(respLine) {
- continue
- }
-
- return fmt.Errorf("resp and fax differ at line %d for %q %q: %q vs %q", lineNo, suite.getDirectory(), test.inFile, respLine, faxLine)
- }
-
- if _, _, ok := nextLineFunc(faxScanner, &nextLineState); ok {
- return fmt.Errorf("fax file is longer than resp for %q %q", suite.getDirectory(), test.inFile)
- }
-
- return nil
-}
diff --git a/src/util/fipstools/delocate/delocate.peg b/src/util/fipstools/delocate/delocate.peg
index f79ed76..c253a48 100644
--- a/src/util/fipstools/delocate/delocate.peg
+++ b/src/util/fipstools/delocate/delocate.peg
@@ -44,7 +44,7 @@
SymbolArg <- (OpenParen WS?)? (
Offset /
SymbolType /
- (Offset / LocalSymbol / SymbolName / Dot) WS? Operator WS? (Offset / LocalSymbol / SymbolName) /
+ (Offset / LocalSymbol / SymbolName / Dot) (WS? Operator WS? (Offset / LocalSymbol / SymbolName))* /
LocalSymbol TCMarker? /
SymbolName Offset /
SymbolName TCMarker?)
diff --git a/src/util/fipstools/delocate/delocate.peg.go b/src/util/fipstools/delocate/delocate.peg.go
index 56c4a20..ea7c195 100644
--- a/src/util/fipstools/delocate/delocate.peg.go
+++ b/src/util/fipstools/delocate/delocate.peg.go
@@ -2540,7 +2540,7 @@
position, tokenIndex = position291, tokenIndex291
return false
},
- /* 16 SymbolArg <- <((OpenParen WS?)? (Offset / SymbolType / ((Offset / LocalSymbol / SymbolName / Dot) WS? Operator WS? (Offset / LocalSymbol / SymbolName)) / (LocalSymbol TCMarker?) / (SymbolName Offset) / (SymbolName TCMarker?)) (WS? CloseParen)? (WS? SymbolShift)?)> */
+ /* 16 SymbolArg <- <((OpenParen WS?)? (Offset / SymbolType / ((Offset / LocalSymbol / SymbolName / Dot) (WS? Operator WS? (Offset / LocalSymbol / SymbolName))*) / (LocalSymbol TCMarker?) / (SymbolName Offset) / (SymbolName TCMarker?)) (WS? CloseParen)? (WS? SymbolShift)?)> */
func() bool {
position299, tokenIndex299 := position, tokenIndex
{
@@ -2604,131 +2604,138 @@
}
}
l309:
+ l313:
{
- position313, tokenIndex313 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l313
+ position314, tokenIndex314 := position, tokenIndex
+ {
+ position315, tokenIndex315 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l315
+ }
+ goto l316
+ l315:
+ position, tokenIndex = position315, tokenIndex315
}
- goto l314
- l313:
- position, tokenIndex = position313, tokenIndex313
- }
- l314:
- if !_rules[ruleOperator]() {
- goto l308
- }
- {
- position315, tokenIndex315 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l315
+ l316:
+ if !_rules[ruleOperator]() {
+ goto l314
}
- goto l316
- l315:
- position, tokenIndex = position315, tokenIndex315
- }
- l316:
- {
- position317, tokenIndex317 := position, tokenIndex
- if !_rules[ruleOffset]() {
+ {
+ position317, tokenIndex317 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l317
+ }
goto l318
+ l317:
+ position, tokenIndex = position317, tokenIndex317
}
- goto l317
l318:
- position, tokenIndex = position317, tokenIndex317
- if !_rules[ruleLocalSymbol]() {
+ {
+ position319, tokenIndex319 := position, tokenIndex
+ if !_rules[ruleOffset]() {
+ goto l320
+ }
goto l319
+ l320:
+ position, tokenIndex = position319, tokenIndex319
+ if !_rules[ruleLocalSymbol]() {
+ goto l321
+ }
+ goto l319
+ l321:
+ position, tokenIndex = position319, tokenIndex319
+ if !_rules[ruleSymbolName]() {
+ goto l314
+ }
}
- goto l317
l319:
- position, tokenIndex = position317, tokenIndex317
- if !_rules[ruleSymbolName]() {
- goto l308
- }
+ goto l313
+ l314:
+ position, tokenIndex = position314, tokenIndex314
}
- l317:
goto l305
l308:
position, tokenIndex = position305, tokenIndex305
if !_rules[ruleLocalSymbol]() {
- goto l320
+ goto l322
}
{
- position321, tokenIndex321 := position, tokenIndex
+ position323, tokenIndex323 := position, tokenIndex
if !_rules[ruleTCMarker]() {
- goto l321
+ goto l323
}
- goto l322
- l321:
- position, tokenIndex = position321, tokenIndex321
+ goto l324
+ l323:
+ position, tokenIndex = position323, tokenIndex323
}
- l322:
+ l324:
goto l305
- l320:
+ l322:
position, tokenIndex = position305, tokenIndex305
if !_rules[ruleSymbolName]() {
- goto l323
+ goto l325
}
if !_rules[ruleOffset]() {
- goto l323
+ goto l325
}
goto l305
- l323:
+ l325:
position, tokenIndex = position305, tokenIndex305
if !_rules[ruleSymbolName]() {
goto l299
}
{
- position324, tokenIndex324 := position, tokenIndex
+ position326, tokenIndex326 := position, tokenIndex
if !_rules[ruleTCMarker]() {
- goto l324
+ goto l326
}
- goto l325
- l324:
- position, tokenIndex = position324, tokenIndex324
+ goto l327
+ l326:
+ position, tokenIndex = position326, tokenIndex326
}
- l325:
+ l327:
}
l305:
{
- position326, tokenIndex326 := position, tokenIndex
+ position328, tokenIndex328 := position, tokenIndex
{
- position328, tokenIndex328 := position, tokenIndex
+ position330, tokenIndex330 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l328
+ goto l330
}
- goto l329
- l328:
- position, tokenIndex = position328, tokenIndex328
+ goto l331
+ l330:
+ position, tokenIndex = position330, tokenIndex330
}
- l329:
+ l331:
if !_rules[ruleCloseParen]() {
- goto l326
+ goto l328
}
- goto l327
- l326:
- position, tokenIndex = position326, tokenIndex326
+ goto l329
+ l328:
+ position, tokenIndex = position328, tokenIndex328
}
- l327:
+ l329:
{
- position330, tokenIndex330 := position, tokenIndex
+ position332, tokenIndex332 := position, tokenIndex
{
- position332, tokenIndex332 := position, tokenIndex
+ position334, tokenIndex334 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l332
+ goto l334
}
- goto l333
- l332:
- position, tokenIndex = position332, tokenIndex332
+ goto l335
+ l334:
+ position, tokenIndex = position334, tokenIndex334
}
- l333:
+ l335:
if !_rules[ruleSymbolShift]() {
- goto l330
+ goto l332
}
- goto l331
- l330:
- position, tokenIndex = position330, tokenIndex330
+ goto l333
+ l332:
+ position, tokenIndex = position332, tokenIndex332
}
- l331:
+ l333:
add(ruleSymbolArg, position300)
}
return true
@@ -2738,721 +2745,707 @@
},
/* 17 OpenParen <- <'('> */
func() bool {
- position334, tokenIndex334 := position, tokenIndex
- {
- position335 := position
- if buffer[position] != rune('(') {
- goto l334
- }
- position++
- add(ruleOpenParen, position335)
- }
- return true
- l334:
- position, tokenIndex = position334, tokenIndex334
- return false
- },
- /* 18 CloseParen <- <')'> */
- func() bool {
position336, tokenIndex336 := position, tokenIndex
{
position337 := position
- if buffer[position] != rune(')') {
+ if buffer[position] != rune('(') {
goto l336
}
position++
- add(ruleCloseParen, position337)
+ add(ruleOpenParen, position337)
}
return true
l336:
position, tokenIndex = position336, tokenIndex336
return false
},
- /* 19 SymbolType <- <(('@' / '%') (('f' 'u' 'n' 'c' 't' 'i' 'o' 'n') / ('o' 'b' 'j' 'e' 'c' 't')))> */
+ /* 18 CloseParen <- <')'> */
func() bool {
position338, tokenIndex338 := position, tokenIndex
{
position339 := position
- {
- position340, tokenIndex340 := position, tokenIndex
- if buffer[position] != rune('@') {
- goto l341
- }
- position++
- goto l340
- l341:
- position, tokenIndex = position340, tokenIndex340
- if buffer[position] != rune('%') {
- goto l338
- }
- position++
+ if buffer[position] != rune(')') {
+ goto l338
}
- l340:
- {
- position342, tokenIndex342 := position, tokenIndex
- if buffer[position] != rune('f') {
- goto l343
- }
- position++
- if buffer[position] != rune('u') {
- goto l343
- }
- position++
- if buffer[position] != rune('n') {
- goto l343
- }
- position++
- if buffer[position] != rune('c') {
- goto l343
- }
- position++
- if buffer[position] != rune('t') {
- goto l343
- }
- position++
- if buffer[position] != rune('i') {
- goto l343
- }
- position++
- if buffer[position] != rune('o') {
- goto l343
- }
- position++
- if buffer[position] != rune('n') {
- goto l343
- }
- position++
- goto l342
- l343:
- position, tokenIndex = position342, tokenIndex342
- if buffer[position] != rune('o') {
- goto l338
- }
- position++
- if buffer[position] != rune('b') {
- goto l338
- }
- position++
- if buffer[position] != rune('j') {
- goto l338
- }
- position++
- if buffer[position] != rune('e') {
- goto l338
- }
- position++
- if buffer[position] != rune('c') {
- goto l338
- }
- position++
- if buffer[position] != rune('t') {
- goto l338
- }
- position++
- }
- l342:
- add(ruleSymbolType, position339)
+ position++
+ add(ruleCloseParen, position339)
}
return true
l338:
position, tokenIndex = position338, tokenIndex338
return false
},
- /* 20 Dot <- <'.'> */
+ /* 19 SymbolType <- <(('@' / '%') (('f' 'u' 'n' 'c' 't' 'i' 'o' 'n') / ('o' 'b' 'j' 'e' 'c' 't')))> */
func() bool {
- position344, tokenIndex344 := position, tokenIndex
+ position340, tokenIndex340 := position, tokenIndex
{
- position345 := position
- if buffer[position] != rune('.') {
- goto l344
+ position341 := position
+ {
+ position342, tokenIndex342 := position, tokenIndex
+ if buffer[position] != rune('@') {
+ goto l343
+ }
+ position++
+ goto l342
+ l343:
+ position, tokenIndex = position342, tokenIndex342
+ if buffer[position] != rune('%') {
+ goto l340
+ }
+ position++
}
- position++
- add(ruleDot, position345)
+ l342:
+ {
+ position344, tokenIndex344 := position, tokenIndex
+ if buffer[position] != rune('f') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('u') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('n') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('c') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('t') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('i') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('o') {
+ goto l345
+ }
+ position++
+ if buffer[position] != rune('n') {
+ goto l345
+ }
+ position++
+ goto l344
+ l345:
+ position, tokenIndex = position344, tokenIndex344
+ if buffer[position] != rune('o') {
+ goto l340
+ }
+ position++
+ if buffer[position] != rune('b') {
+ goto l340
+ }
+ position++
+ if buffer[position] != rune('j') {
+ goto l340
+ }
+ position++
+ if buffer[position] != rune('e') {
+ goto l340
+ }
+ position++
+ if buffer[position] != rune('c') {
+ goto l340
+ }
+ position++
+ if buffer[position] != rune('t') {
+ goto l340
+ }
+ position++
+ }
+ l344:
+ add(ruleSymbolType, position341)
}
return true
- l344:
- position, tokenIndex = position344, tokenIndex344
+ l340:
+ position, tokenIndex = position340, tokenIndex340
return false
},
- /* 21 TCMarker <- <('[' 'T' 'C' ']')> */
+ /* 20 Dot <- <'.'> */
func() bool {
position346, tokenIndex346 := position, tokenIndex
{
position347 := position
- if buffer[position] != rune('[') {
+ if buffer[position] != rune('.') {
goto l346
}
position++
- if buffer[position] != rune('T') {
- goto l346
- }
- position++
- if buffer[position] != rune('C') {
- goto l346
- }
- position++
- if buffer[position] != rune(']') {
- goto l346
- }
- position++
- add(ruleTCMarker, position347)
+ add(ruleDot, position347)
}
return true
l346:
position, tokenIndex = position346, tokenIndex346
return false
},
- /* 22 EscapedChar <- <('\\' .)> */
+ /* 21 TCMarker <- <('[' 'T' 'C' ']')> */
func() bool {
position348, tokenIndex348 := position, tokenIndex
{
position349 := position
- if buffer[position] != rune('\\') {
+ if buffer[position] != rune('[') {
goto l348
}
position++
- if !matchDot() {
+ if buffer[position] != rune('T') {
goto l348
}
- add(ruleEscapedChar, position349)
+ position++
+ if buffer[position] != rune('C') {
+ goto l348
+ }
+ position++
+ if buffer[position] != rune(']') {
+ goto l348
+ }
+ position++
+ add(ruleTCMarker, position349)
}
return true
l348:
position, tokenIndex = position348, tokenIndex348
return false
},
- /* 23 WS <- <(' ' / '\t')+> */
+ /* 22 EscapedChar <- <('\\' .)> */
func() bool {
position350, tokenIndex350 := position, tokenIndex
{
position351 := position
- {
- position354, tokenIndex354 := position, tokenIndex
- if buffer[position] != rune(' ') {
- goto l355
- }
- position++
- goto l354
- l355:
- position, tokenIndex = position354, tokenIndex354
- if buffer[position] != rune('\t') {
- goto l350
- }
- position++
+ if buffer[position] != rune('\\') {
+ goto l350
}
- l354:
- l352:
- {
- position353, tokenIndex353 := position, tokenIndex
- {
- position356, tokenIndex356 := position, tokenIndex
- if buffer[position] != rune(' ') {
- goto l357
- }
- position++
- goto l356
- l357:
- position, tokenIndex = position356, tokenIndex356
- if buffer[position] != rune('\t') {
- goto l353
- }
- position++
- }
- l356:
- goto l352
- l353:
- position, tokenIndex = position353, tokenIndex353
+ position++
+ if !matchDot() {
+ goto l350
}
- add(ruleWS, position351)
+ add(ruleEscapedChar, position351)
}
return true
l350:
position, tokenIndex = position350, tokenIndex350
return false
},
- /* 24 Comment <- <((('/' '/') / '#') (!'\n' .)*)> */
+ /* 23 WS <- <(' ' / '\t')+> */
func() bool {
- position358, tokenIndex358 := position, tokenIndex
+ position352, tokenIndex352 := position, tokenIndex
{
- position359 := position
+ position353 := position
{
- position360, tokenIndex360 := position, tokenIndex
- if buffer[position] != rune('/') {
- goto l361
+ position356, tokenIndex356 := position, tokenIndex
+ if buffer[position] != rune(' ') {
+ goto l357
}
position++
- if buffer[position] != rune('/') {
- goto l361
- }
- position++
- goto l360
- l361:
- position, tokenIndex = position360, tokenIndex360
- if buffer[position] != rune('#') {
- goto l358
+ goto l356
+ l357:
+ position, tokenIndex = position356, tokenIndex356
+ if buffer[position] != rune('\t') {
+ goto l352
}
position++
}
- l360:
- l362:
+ l356:
+ l354:
{
- position363, tokenIndex363 := position, tokenIndex
+ position355, tokenIndex355 := position, tokenIndex
{
- position364, tokenIndex364 := position, tokenIndex
- if buffer[position] != rune('\n') {
- goto l364
+ position358, tokenIndex358 := position, tokenIndex
+ if buffer[position] != rune(' ') {
+ goto l359
}
position++
- goto l363
- l364:
- position, tokenIndex = position364, tokenIndex364
+ goto l358
+ l359:
+ position, tokenIndex = position358, tokenIndex358
+ if buffer[position] != rune('\t') {
+ goto l355
+ }
+ position++
}
- if !matchDot() {
- goto l363
- }
- goto l362
- l363:
- position, tokenIndex = position363, tokenIndex363
+ l358:
+ goto l354
+ l355:
+ position, tokenIndex = position355, tokenIndex355
}
- add(ruleComment, position359)
+ add(ruleWS, position353)
}
return true
- l358:
- position, tokenIndex = position358, tokenIndex358
+ l352:
+ position, tokenIndex = position352, tokenIndex352
+ return false
+ },
+ /* 24 Comment <- <((('/' '/') / '#') (!'\n' .)*)> */
+ func() bool {
+ position360, tokenIndex360 := position, tokenIndex
+ {
+ position361 := position
+ {
+ position362, tokenIndex362 := position, tokenIndex
+ if buffer[position] != rune('/') {
+ goto l363
+ }
+ position++
+ if buffer[position] != rune('/') {
+ goto l363
+ }
+ position++
+ goto l362
+ l363:
+ position, tokenIndex = position362, tokenIndex362
+ if buffer[position] != rune('#') {
+ goto l360
+ }
+ position++
+ }
+ l362:
+ l364:
+ {
+ position365, tokenIndex365 := position, tokenIndex
+ {
+ position366, tokenIndex366 := position, tokenIndex
+ if buffer[position] != rune('\n') {
+ goto l366
+ }
+ position++
+ goto l365
+ l366:
+ position, tokenIndex = position366, tokenIndex366
+ }
+ if !matchDot() {
+ goto l365
+ }
+ goto l364
+ l365:
+ position, tokenIndex = position365, tokenIndex365
+ }
+ add(ruleComment, position361)
+ }
+ return true
+ l360:
+ position, tokenIndex = position360, tokenIndex360
return false
},
/* 25 Label <- <((LocalSymbol / LocalLabel / SymbolName) ':')> */
func() bool {
- position365, tokenIndex365 := position, tokenIndex
+ position367, tokenIndex367 := position, tokenIndex
{
- position366 := position
+ position368 := position
{
- position367, tokenIndex367 := position, tokenIndex
+ position369, tokenIndex369 := position, tokenIndex
if !_rules[ruleLocalSymbol]() {
- goto l368
+ goto l370
}
- goto l367
- l368:
- position, tokenIndex = position367, tokenIndex367
+ goto l369
+ l370:
+ position, tokenIndex = position369, tokenIndex369
if !_rules[ruleLocalLabel]() {
- goto l369
+ goto l371
}
- goto l367
- l369:
- position, tokenIndex = position367, tokenIndex367
+ goto l369
+ l371:
+ position, tokenIndex = position369, tokenIndex369
if !_rules[ruleSymbolName]() {
- goto l365
+ goto l367
}
}
- l367:
+ l369:
if buffer[position] != rune(':') {
- goto l365
+ goto l367
}
position++
- add(ruleLabel, position366)
+ add(ruleLabel, position368)
}
return true
- l365:
- position, tokenIndex = position365, tokenIndex365
+ l367:
+ position, tokenIndex = position367, tokenIndex367
return false
},
/* 26 SymbolName <- <(([a-z] / [A-Z] / '.' / '_') ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]) / '$' / '_')*)> */
func() bool {
- position370, tokenIndex370 := position, tokenIndex
+ position372, tokenIndex372 := position, tokenIndex
{
- position371 := position
+ position373 := position
{
- position372, tokenIndex372 := position, tokenIndex
+ position374, tokenIndex374 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l373
- }
- position++
- goto l372
- l373:
- position, tokenIndex = position372, tokenIndex372
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l374
- }
- position++
- goto l372
- l374:
- position, tokenIndex = position372, tokenIndex372
- if buffer[position] != rune('.') {
goto l375
}
position++
- goto l372
+ goto l374
l375:
- position, tokenIndex = position372, tokenIndex372
+ position, tokenIndex = position374, tokenIndex374
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l376
+ }
+ position++
+ goto l374
+ l376:
+ position, tokenIndex = position374, tokenIndex374
+ if buffer[position] != rune('.') {
+ goto l377
+ }
+ position++
+ goto l374
+ l377:
+ position, tokenIndex = position374, tokenIndex374
if buffer[position] != rune('_') {
- goto l370
+ goto l372
}
position++
}
- l372:
- l376:
+ l374:
+ l378:
{
- position377, tokenIndex377 := position, tokenIndex
+ position379, tokenIndex379 := position, tokenIndex
{
- position378, tokenIndex378 := position, tokenIndex
+ position380, tokenIndex380 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l379
- }
- position++
- goto l378
- l379:
- position, tokenIndex = position378, tokenIndex378
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l380
- }
- position++
- goto l378
- l380:
- position, tokenIndex = position378, tokenIndex378
- if buffer[position] != rune('.') {
goto l381
}
position++
- goto l378
+ goto l380
l381:
- position, tokenIndex = position378, tokenIndex378
+ position, tokenIndex = position380, tokenIndex380
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l382
+ }
+ position++
+ goto l380
+ l382:
+ position, tokenIndex = position380, tokenIndex380
+ if buffer[position] != rune('.') {
+ goto l383
+ }
+ position++
+ goto l380
+ l383:
+ position, tokenIndex = position380, tokenIndex380
{
- position383, tokenIndex383 := position, tokenIndex
+ position385, tokenIndex385 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l386
+ }
+ position++
+ goto l385
+ l386:
+ position, tokenIndex = position385, tokenIndex385
if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l384
}
position++
- goto l383
- l384:
- position, tokenIndex = position383, tokenIndex383
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l382
- }
- position++
}
- l383:
- goto l378
- l382:
- position, tokenIndex = position378, tokenIndex378
+ l385:
+ goto l380
+ l384:
+ position, tokenIndex = position380, tokenIndex380
if buffer[position] != rune('$') {
- goto l385
+ goto l387
}
position++
- goto l378
- l385:
- position, tokenIndex = position378, tokenIndex378
+ goto l380
+ l387:
+ position, tokenIndex = position380, tokenIndex380
if buffer[position] != rune('_') {
- goto l377
+ goto l379
}
position++
}
- l378:
- goto l376
- l377:
- position, tokenIndex = position377, tokenIndex377
+ l380:
+ goto l378
+ l379:
+ position, tokenIndex = position379, tokenIndex379
}
- add(ruleSymbolName, position371)
+ add(ruleSymbolName, position373)
}
return true
- l370:
- position, tokenIndex = position370, tokenIndex370
+ l372:
+ position, tokenIndex = position372, tokenIndex372
return false
},
/* 27 LocalSymbol <- <('.' 'L' ([a-z] / [A-Z] / ([a-z] / [A-Z]) / '.' / ([0-9] / [0-9]) / '$' / '_')+)> */
func() bool {
- position386, tokenIndex386 := position, tokenIndex
+ position388, tokenIndex388 := position, tokenIndex
{
- position387 := position
+ position389 := position
if buffer[position] != rune('.') {
- goto l386
+ goto l388
}
position++
if buffer[position] != rune('L') {
- goto l386
+ goto l388
}
position++
{
- position390, tokenIndex390 := position, tokenIndex
+ position392, tokenIndex392 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l391
+ goto l393
}
position++
- goto l390
- l391:
- position, tokenIndex = position390, tokenIndex390
+ goto l392
+ l393:
+ position, tokenIndex = position392, tokenIndex392
if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l392
+ goto l394
}
position++
- goto l390
- l392:
- position, tokenIndex = position390, tokenIndex390
+ goto l392
+ l394:
+ position, tokenIndex = position392, tokenIndex392
{
- position394, tokenIndex394 := position, tokenIndex
+ position396, tokenIndex396 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
+ goto l397
+ }
+ position++
+ goto l396
+ l397:
+ position, tokenIndex = position396, tokenIndex396
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
goto l395
}
position++
- goto l394
- l395:
- position, tokenIndex = position394, tokenIndex394
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l393
- }
- position++
}
- l394:
- goto l390
- l393:
- position, tokenIndex = position390, tokenIndex390
+ l396:
+ goto l392
+ l395:
+ position, tokenIndex = position392, tokenIndex392
if buffer[position] != rune('.') {
- goto l396
+ goto l398
}
position++
- goto l390
- l396:
- position, tokenIndex = position390, tokenIndex390
+ goto l392
+ l398:
+ position, tokenIndex = position392, tokenIndex392
{
- position398, tokenIndex398 := position, tokenIndex
+ position400, tokenIndex400 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l401
+ }
+ position++
+ goto l400
+ l401:
+ position, tokenIndex = position400, tokenIndex400
if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l399
}
position++
- goto l398
- l399:
- position, tokenIndex = position398, tokenIndex398
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l397
- }
- position++
}
- l398:
- goto l390
- l397:
- position, tokenIndex = position390, tokenIndex390
+ l400:
+ goto l392
+ l399:
+ position, tokenIndex = position392, tokenIndex392
if buffer[position] != rune('$') {
- goto l400
+ goto l402
}
position++
- goto l390
- l400:
- position, tokenIndex = position390, tokenIndex390
+ goto l392
+ l402:
+ position, tokenIndex = position392, tokenIndex392
if buffer[position] != rune('_') {
- goto l386
+ goto l388
}
position++
}
+ l392:
l390:
- l388:
{
- position389, tokenIndex389 := position, tokenIndex
+ position391, tokenIndex391 := position, tokenIndex
{
- position401, tokenIndex401 := position, tokenIndex
+ position403, tokenIndex403 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l402
+ goto l404
}
position++
- goto l401
- l402:
- position, tokenIndex = position401, tokenIndex401
+ goto l403
+ l404:
+ position, tokenIndex = position403, tokenIndex403
if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l403
+ goto l405
}
position++
- goto l401
- l403:
- position, tokenIndex = position401, tokenIndex401
+ goto l403
+ l405:
+ position, tokenIndex = position403, tokenIndex403
{
- position405, tokenIndex405 := position, tokenIndex
+ position407, tokenIndex407 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
+ goto l408
+ }
+ position++
+ goto l407
+ l408:
+ position, tokenIndex = position407, tokenIndex407
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
goto l406
}
position++
- goto l405
- l406:
- position, tokenIndex = position405, tokenIndex405
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l404
- }
- position++
}
- l405:
- goto l401
- l404:
- position, tokenIndex = position401, tokenIndex401
+ l407:
+ goto l403
+ l406:
+ position, tokenIndex = position403, tokenIndex403
if buffer[position] != rune('.') {
- goto l407
+ goto l409
}
position++
- goto l401
- l407:
- position, tokenIndex = position401, tokenIndex401
+ goto l403
+ l409:
+ position, tokenIndex = position403, tokenIndex403
{
- position409, tokenIndex409 := position, tokenIndex
+ position411, tokenIndex411 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l412
+ }
+ position++
+ goto l411
+ l412:
+ position, tokenIndex = position411, tokenIndex411
if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l410
}
position++
- goto l409
- l410:
- position, tokenIndex = position409, tokenIndex409
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l408
- }
- position++
}
- l409:
- goto l401
- l408:
- position, tokenIndex = position401, tokenIndex401
+ l411:
+ goto l403
+ l410:
+ position, tokenIndex = position403, tokenIndex403
if buffer[position] != rune('$') {
- goto l411
+ goto l413
}
position++
- goto l401
- l411:
- position, tokenIndex = position401, tokenIndex401
+ goto l403
+ l413:
+ position, tokenIndex = position403, tokenIndex403
if buffer[position] != rune('_') {
- goto l389
+ goto l391
}
position++
}
- l401:
- goto l388
- l389:
- position, tokenIndex = position389, tokenIndex389
+ l403:
+ goto l390
+ l391:
+ position, tokenIndex = position391, tokenIndex391
}
- add(ruleLocalSymbol, position387)
+ add(ruleLocalSymbol, position389)
}
return true
- l386:
- position, tokenIndex = position386, tokenIndex386
+ l388:
+ position, tokenIndex = position388, tokenIndex388
return false
},
/* 28 LocalLabel <- <([0-9] ([0-9] / '$')*)> */
func() bool {
- position412, tokenIndex412 := position, tokenIndex
+ position414, tokenIndex414 := position, tokenIndex
{
- position413 := position
+ position415 := position
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l412
+ goto l414
}
position++
- l414:
+ l416:
{
- position415, tokenIndex415 := position, tokenIndex
+ position417, tokenIndex417 := position, tokenIndex
{
- position416, tokenIndex416 := position, tokenIndex
+ position418, tokenIndex418 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l419
+ }
+ position++
+ goto l418
+ l419:
+ position, tokenIndex = position418, tokenIndex418
+ if buffer[position] != rune('$') {
goto l417
}
position++
- goto l416
- l417:
- position, tokenIndex = position416, tokenIndex416
- if buffer[position] != rune('$') {
- goto l415
- }
- position++
}
- l416:
- goto l414
- l415:
- position, tokenIndex = position415, tokenIndex415
+ l418:
+ goto l416
+ l417:
+ position, tokenIndex = position417, tokenIndex417
}
- add(ruleLocalLabel, position413)
+ add(ruleLocalLabel, position415)
}
return true
- l412:
- position, tokenIndex = position412, tokenIndex412
+ l414:
+ position, tokenIndex = position414, tokenIndex414
return false
},
/* 29 LocalLabelRef <- <([0-9] ([0-9] / '$')* ('b' / 'f'))> */
func() bool {
- position418, tokenIndex418 := position, tokenIndex
+ position420, tokenIndex420 := position, tokenIndex
{
- position419 := position
+ position421 := position
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l418
+ goto l420
}
position++
- l420:
+ l422:
{
- position421, tokenIndex421 := position, tokenIndex
+ position423, tokenIndex423 := position, tokenIndex
{
- position422, tokenIndex422 := position, tokenIndex
+ position424, tokenIndex424 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l425
+ }
+ position++
+ goto l424
+ l425:
+ position, tokenIndex = position424, tokenIndex424
+ if buffer[position] != rune('$') {
goto l423
}
position++
- goto l422
- l423:
- position, tokenIndex = position422, tokenIndex422
- if buffer[position] != rune('$') {
- goto l421
- }
- position++
}
- l422:
- goto l420
- l421:
- position, tokenIndex = position421, tokenIndex421
+ l424:
+ goto l422
+ l423:
+ position, tokenIndex = position423, tokenIndex423
}
{
- position424, tokenIndex424 := position, tokenIndex
+ position426, tokenIndex426 := position, tokenIndex
if buffer[position] != rune('b') {
- goto l425
+ goto l427
}
position++
- goto l424
- l425:
- position, tokenIndex = position424, tokenIndex424
+ goto l426
+ l427:
+ position, tokenIndex = position426, tokenIndex426
if buffer[position] != rune('f') {
- goto l418
+ goto l420
}
position++
}
- l424:
- add(ruleLocalLabelRef, position419)
+ l426:
+ add(ruleLocalLabelRef, position421)
}
return true
- l418:
- position, tokenIndex = position418, tokenIndex418
+ l420:
+ position, tokenIndex = position420, tokenIndex420
return false
},
/* 30 Instruction <- <(InstructionName (WS InstructionArg (WS? ',' WS? InstructionArg)*)?)> */
func() bool {
- position426, tokenIndex426 := position, tokenIndex
+ position428, tokenIndex428 := position, tokenIndex
{
- position427 := position
+ position429 := position
if !_rules[ruleInstructionName]() {
- goto l426
+ goto l428
}
{
- position428, tokenIndex428 := position, tokenIndex
+ position430, tokenIndex430 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l428
+ goto l430
}
if !_rules[ruleInstructionArg]() {
- goto l428
+ goto l430
}
- l430:
+ l432:
{
- position431, tokenIndex431 := position, tokenIndex
- {
- position432, tokenIndex432 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l432
- }
- goto l433
- l432:
- position, tokenIndex = position432, tokenIndex432
- }
- l433:
- if buffer[position] != rune(',') {
- goto l431
- }
- position++
+ position433, tokenIndex433 := position, tokenIndex
{
position434, tokenIndex434 := position, tokenIndex
if !_rules[ruleWS]() {
@@ -3463,1013 +3456,1009 @@
position, tokenIndex = position434, tokenIndex434
}
l435:
- if !_rules[ruleInstructionArg]() {
- goto l431
+ if buffer[position] != rune(',') {
+ goto l433
}
- goto l430
- l431:
- position, tokenIndex = position431, tokenIndex431
+ position++
+ {
+ position436, tokenIndex436 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l436
+ }
+ goto l437
+ l436:
+ position, tokenIndex = position436, tokenIndex436
+ }
+ l437:
+ if !_rules[ruleInstructionArg]() {
+ goto l433
+ }
+ goto l432
+ l433:
+ position, tokenIndex = position433, tokenIndex433
}
- goto l429
- l428:
- position, tokenIndex = position428, tokenIndex428
+ goto l431
+ l430:
+ position, tokenIndex = position430, tokenIndex430
}
- l429:
- add(ruleInstruction, position427)
+ l431:
+ add(ruleInstruction, position429)
}
return true
- l426:
- position, tokenIndex = position426, tokenIndex426
+ l428:
+ position, tokenIndex = position428, tokenIndex428
return false
},
/* 31 InstructionName <- <(([a-z] / [A-Z]) ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]))* ('.' / '+' / '-')?)> */
func() bool {
- position436, tokenIndex436 := position, tokenIndex
+ position438, tokenIndex438 := position, tokenIndex
{
- position437 := position
+ position439 := position
{
- position438, tokenIndex438 := position, tokenIndex
+ position440, tokenIndex440 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l439
+ goto l441
}
position++
- goto l438
- l439:
- position, tokenIndex = position438, tokenIndex438
+ goto l440
+ l441:
+ position, tokenIndex = position440, tokenIndex440
if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l436
+ goto l438
}
position++
}
- l438:
l440:
+ l442:
{
- position441, tokenIndex441 := position, tokenIndex
+ position443, tokenIndex443 := position, tokenIndex
{
- position442, tokenIndex442 := position, tokenIndex
+ position444, tokenIndex444 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l443
- }
- position++
- goto l442
- l443:
- position, tokenIndex = position442, tokenIndex442
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l444
- }
- position++
- goto l442
- l444:
- position, tokenIndex = position442, tokenIndex442
- if buffer[position] != rune('.') {
goto l445
}
position++
- goto l442
+ goto l444
l445:
- position, tokenIndex = position442, tokenIndex442
+ position, tokenIndex = position444, tokenIndex444
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l446
+ }
+ position++
+ goto l444
+ l446:
+ position, tokenIndex = position444, tokenIndex444
+ if buffer[position] != rune('.') {
+ goto l447
+ }
+ position++
+ goto l444
+ l447:
+ position, tokenIndex = position444, tokenIndex444
{
- position446, tokenIndex446 := position, tokenIndex
+ position448, tokenIndex448 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l447
+ goto l449
}
position++
- goto l446
- l447:
- position, tokenIndex = position446, tokenIndex446
+ goto l448
+ l449:
+ position, tokenIndex = position448, tokenIndex448
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l441
+ goto l443
}
position++
}
- l446:
+ l448:
}
- l442:
- goto l440
- l441:
- position, tokenIndex = position441, tokenIndex441
+ l444:
+ goto l442
+ l443:
+ position, tokenIndex = position443, tokenIndex443
}
{
- position448, tokenIndex448 := position, tokenIndex
+ position450, tokenIndex450 := position, tokenIndex
{
- position450, tokenIndex450 := position, tokenIndex
+ position452, tokenIndex452 := position, tokenIndex
if buffer[position] != rune('.') {
- goto l451
+ goto l453
}
position++
- goto l450
- l451:
- position, tokenIndex = position450, tokenIndex450
+ goto l452
+ l453:
+ position, tokenIndex = position452, tokenIndex452
if buffer[position] != rune('+') {
- goto l452
+ goto l454
}
position++
- goto l450
- l452:
- position, tokenIndex = position450, tokenIndex450
+ goto l452
+ l454:
+ position, tokenIndex = position452, tokenIndex452
if buffer[position] != rune('-') {
- goto l448
+ goto l450
}
position++
}
+ l452:
+ goto l451
l450:
- goto l449
- l448:
- position, tokenIndex = position448, tokenIndex448
+ position, tokenIndex = position450, tokenIndex450
}
- l449:
- add(ruleInstructionName, position437)
+ l451:
+ add(ruleInstructionName, position439)
}
return true
- l436:
- position, tokenIndex = position436, tokenIndex436
+ l438:
+ position, tokenIndex = position438, tokenIndex438
return false
},
/* 32 InstructionArg <- <(IndirectionIndicator? (ARMConstantTweak / RegisterOrConstant / LocalLabelRef / TOCRefHigh / TOCRefLow / GOTLocation / GOTSymbolOffset / MemoryRef) AVX512Token*)> */
func() bool {
- position453, tokenIndex453 := position, tokenIndex
+ position455, tokenIndex455 := position, tokenIndex
{
- position454 := position
- {
- position455, tokenIndex455 := position, tokenIndex
- if !_rules[ruleIndirectionIndicator]() {
- goto l455
- }
- goto l456
- l455:
- position, tokenIndex = position455, tokenIndex455
- }
- l456:
+ position456 := position
{
position457, tokenIndex457 := position, tokenIndex
+ if !_rules[ruleIndirectionIndicator]() {
+ goto l457
+ }
+ goto l458
+ l457:
+ position, tokenIndex = position457, tokenIndex457
+ }
+ l458:
+ {
+ position459, tokenIndex459 := position, tokenIndex
if !_rules[ruleARMConstantTweak]() {
- goto l458
- }
- goto l457
- l458:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleRegisterOrConstant]() {
- goto l459
- }
- goto l457
- l459:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleLocalLabelRef]() {
goto l460
}
- goto l457
+ goto l459
l460:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleTOCRefHigh]() {
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleRegisterOrConstant]() {
goto l461
}
- goto l457
+ goto l459
l461:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleTOCRefLow]() {
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleLocalLabelRef]() {
goto l462
}
- goto l457
+ goto l459
l462:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleGOTLocation]() {
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleTOCRefHigh]() {
goto l463
}
- goto l457
+ goto l459
l463:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleGOTSymbolOffset]() {
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleTOCRefLow]() {
goto l464
}
- goto l457
+ goto l459
l464:
- position, tokenIndex = position457, tokenIndex457
- if !_rules[ruleMemoryRef]() {
- goto l453
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleGOTLocation]() {
+ goto l465
}
- }
- l457:
- l465:
- {
- position466, tokenIndex466 := position, tokenIndex
- if !_rules[ruleAVX512Token]() {
+ goto l459
+ l465:
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleGOTSymbolOffset]() {
goto l466
}
- goto l465
+ goto l459
l466:
- position, tokenIndex = position466, tokenIndex466
+ position, tokenIndex = position459, tokenIndex459
+ if !_rules[ruleMemoryRef]() {
+ goto l455
+ }
}
- add(ruleInstructionArg, position454)
+ l459:
+ l467:
+ {
+ position468, tokenIndex468 := position, tokenIndex
+ if !_rules[ruleAVX512Token]() {
+ goto l468
+ }
+ goto l467
+ l468:
+ position, tokenIndex = position468, tokenIndex468
+ }
+ add(ruleInstructionArg, position456)
}
return true
- l453:
- position, tokenIndex = position453, tokenIndex453
+ l455:
+ position, tokenIndex = position455, tokenIndex455
return false
},
/* 33 GOTLocation <- <('$' '_' 'G' 'L' 'O' 'B' 'A' 'L' '_' 'O' 'F' 'F' 'S' 'E' 'T' '_' 'T' 'A' 'B' 'L' 'E' '_' '-' LocalSymbol)> */
func() bool {
- position467, tokenIndex467 := position, tokenIndex
- {
- position468 := position
- if buffer[position] != rune('$') {
- goto l467
- }
- position++
- if buffer[position] != rune('_') {
- goto l467
- }
- position++
- if buffer[position] != rune('G') {
- goto l467
- }
- position++
- if buffer[position] != rune('L') {
- goto l467
- }
- position++
- if buffer[position] != rune('O') {
- goto l467
- }
- position++
- if buffer[position] != rune('B') {
- goto l467
- }
- position++
- if buffer[position] != rune('A') {
- goto l467
- }
- position++
- if buffer[position] != rune('L') {
- goto l467
- }
- position++
- if buffer[position] != rune('_') {
- goto l467
- }
- position++
- if buffer[position] != rune('O') {
- goto l467
- }
- position++
- if buffer[position] != rune('F') {
- goto l467
- }
- position++
- if buffer[position] != rune('F') {
- goto l467
- }
- position++
- if buffer[position] != rune('S') {
- goto l467
- }
- position++
- if buffer[position] != rune('E') {
- goto l467
- }
- position++
- if buffer[position] != rune('T') {
- goto l467
- }
- position++
- if buffer[position] != rune('_') {
- goto l467
- }
- position++
- if buffer[position] != rune('T') {
- goto l467
- }
- position++
- if buffer[position] != rune('A') {
- goto l467
- }
- position++
- if buffer[position] != rune('B') {
- goto l467
- }
- position++
- if buffer[position] != rune('L') {
- goto l467
- }
- position++
- if buffer[position] != rune('E') {
- goto l467
- }
- position++
- if buffer[position] != rune('_') {
- goto l467
- }
- position++
- if buffer[position] != rune('-') {
- goto l467
- }
- position++
- if !_rules[ruleLocalSymbol]() {
- goto l467
- }
- add(ruleGOTLocation, position468)
- }
- return true
- l467:
- position, tokenIndex = position467, tokenIndex467
- return false
- },
- /* 34 GOTSymbolOffset <- <(('$' SymbolName ('@' 'G' 'O' 'T') ('O' 'F' 'F')?) / (':' ('g' / 'G') ('o' / 'O') ('t' / 'T') ':' SymbolName))> */
- func() bool {
position469, tokenIndex469 := position, tokenIndex
{
position470 := position
- {
- position471, tokenIndex471 := position, tokenIndex
- if buffer[position] != rune('$') {
- goto l472
- }
- position++
- if !_rules[ruleSymbolName]() {
- goto l472
- }
- if buffer[position] != rune('@') {
- goto l472
- }
- position++
- if buffer[position] != rune('G') {
- goto l472
- }
- position++
- if buffer[position] != rune('O') {
- goto l472
- }
- position++
- if buffer[position] != rune('T') {
- goto l472
- }
- position++
- {
- position473, tokenIndex473 := position, tokenIndex
- if buffer[position] != rune('O') {
- goto l473
- }
- position++
- if buffer[position] != rune('F') {
- goto l473
- }
- position++
- if buffer[position] != rune('F') {
- goto l473
- }
- position++
- goto l474
- l473:
- position, tokenIndex = position473, tokenIndex473
- }
- l474:
- goto l471
- l472:
- position, tokenIndex = position471, tokenIndex471
- if buffer[position] != rune(':') {
- goto l469
- }
- position++
- {
- position475, tokenIndex475 := position, tokenIndex
- if buffer[position] != rune('g') {
- goto l476
- }
- position++
- goto l475
- l476:
- position, tokenIndex = position475, tokenIndex475
- if buffer[position] != rune('G') {
- goto l469
- }
- position++
- }
- l475:
- {
- position477, tokenIndex477 := position, tokenIndex
- if buffer[position] != rune('o') {
- goto l478
- }
- position++
- goto l477
- l478:
- position, tokenIndex = position477, tokenIndex477
- if buffer[position] != rune('O') {
- goto l469
- }
- position++
- }
- l477:
- {
- position479, tokenIndex479 := position, tokenIndex
- if buffer[position] != rune('t') {
- goto l480
- }
- position++
- goto l479
- l480:
- position, tokenIndex = position479, tokenIndex479
- if buffer[position] != rune('T') {
- goto l469
- }
- position++
- }
- l479:
- if buffer[position] != rune(':') {
- goto l469
- }
- position++
- if !_rules[ruleSymbolName]() {
- goto l469
- }
+ if buffer[position] != rune('$') {
+ goto l469
}
- l471:
- add(ruleGOTSymbolOffset, position470)
+ position++
+ if buffer[position] != rune('_') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('G') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('L') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('O') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('B') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('A') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('L') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('_') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('O') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('F') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('F') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('S') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('E') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('T') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('_') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('T') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('A') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('B') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('L') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('E') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('_') {
+ goto l469
+ }
+ position++
+ if buffer[position] != rune('-') {
+ goto l469
+ }
+ position++
+ if !_rules[ruleLocalSymbol]() {
+ goto l469
+ }
+ add(ruleGOTLocation, position470)
}
return true
l469:
position, tokenIndex = position469, tokenIndex469
return false
},
- /* 35 AVX512Token <- <(WS? '{' '%'? ([0-9] / [a-z])* '}')> */
+ /* 34 GOTSymbolOffset <- <(('$' SymbolName ('@' 'G' 'O' 'T') ('O' 'F' 'F')?) / (':' ('g' / 'G') ('o' / 'O') ('t' / 'T') ':' SymbolName))> */
func() bool {
- position481, tokenIndex481 := position, tokenIndex
+ position471, tokenIndex471 := position, tokenIndex
{
- position482 := position
+ position472 := position
{
- position483, tokenIndex483 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l483
- }
- goto l484
- l483:
- position, tokenIndex = position483, tokenIndex483
- }
- l484:
- if buffer[position] != rune('{') {
- goto l481
- }
- position++
- {
- position485, tokenIndex485 := position, tokenIndex
- if buffer[position] != rune('%') {
- goto l485
+ position473, tokenIndex473 := position, tokenIndex
+ if buffer[position] != rune('$') {
+ goto l474
}
position++
+ if !_rules[ruleSymbolName]() {
+ goto l474
+ }
+ if buffer[position] != rune('@') {
+ goto l474
+ }
+ position++
+ if buffer[position] != rune('G') {
+ goto l474
+ }
+ position++
+ if buffer[position] != rune('O') {
+ goto l474
+ }
+ position++
+ if buffer[position] != rune('T') {
+ goto l474
+ }
+ position++
+ {
+ position475, tokenIndex475 := position, tokenIndex
+ if buffer[position] != rune('O') {
+ goto l475
+ }
+ position++
+ if buffer[position] != rune('F') {
+ goto l475
+ }
+ position++
+ if buffer[position] != rune('F') {
+ goto l475
+ }
+ position++
+ goto l476
+ l475:
+ position, tokenIndex = position475, tokenIndex475
+ }
+ l476:
+ goto l473
+ l474:
+ position, tokenIndex = position473, tokenIndex473
+ if buffer[position] != rune(':') {
+ goto l471
+ }
+ position++
+ {
+ position477, tokenIndex477 := position, tokenIndex
+ if buffer[position] != rune('g') {
+ goto l478
+ }
+ position++
+ goto l477
+ l478:
+ position, tokenIndex = position477, tokenIndex477
+ if buffer[position] != rune('G') {
+ goto l471
+ }
+ position++
+ }
+ l477:
+ {
+ position479, tokenIndex479 := position, tokenIndex
+ if buffer[position] != rune('o') {
+ goto l480
+ }
+ position++
+ goto l479
+ l480:
+ position, tokenIndex = position479, tokenIndex479
+ if buffer[position] != rune('O') {
+ goto l471
+ }
+ position++
+ }
+ l479:
+ {
+ position481, tokenIndex481 := position, tokenIndex
+ if buffer[position] != rune('t') {
+ goto l482
+ }
+ position++
+ goto l481
+ l482:
+ position, tokenIndex = position481, tokenIndex481
+ if buffer[position] != rune('T') {
+ goto l471
+ }
+ position++
+ }
+ l481:
+ if buffer[position] != rune(':') {
+ goto l471
+ }
+ position++
+ if !_rules[ruleSymbolName]() {
+ goto l471
+ }
+ }
+ l473:
+ add(ruleGOTSymbolOffset, position472)
+ }
+ return true
+ l471:
+ position, tokenIndex = position471, tokenIndex471
+ return false
+ },
+ /* 35 AVX512Token <- <(WS? '{' '%'? ([0-9] / [a-z])* '}')> */
+ func() bool {
+ position483, tokenIndex483 := position, tokenIndex
+ {
+ position484 := position
+ {
+ position485, tokenIndex485 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l485
+ }
goto l486
l485:
position, tokenIndex = position485, tokenIndex485
}
l486:
- l487:
+ if buffer[position] != rune('{') {
+ goto l483
+ }
+ position++
{
- position488, tokenIndex488 := position, tokenIndex
+ position487, tokenIndex487 := position, tokenIndex
+ if buffer[position] != rune('%') {
+ goto l487
+ }
+ position++
+ goto l488
+ l487:
+ position, tokenIndex = position487, tokenIndex487
+ }
+ l488:
+ l489:
+ {
+ position490, tokenIndex490 := position, tokenIndex
{
- position489, tokenIndex489 := position, tokenIndex
+ position491, tokenIndex491 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l492
+ }
+ position++
+ goto l491
+ l492:
+ position, tokenIndex = position491, tokenIndex491
+ if c := buffer[position]; c < rune('a') || c > rune('z') {
goto l490
}
position++
- goto l489
- l490:
- position, tokenIndex = position489, tokenIndex489
- if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l488
- }
- position++
}
- l489:
- goto l487
- l488:
- position, tokenIndex = position488, tokenIndex488
+ l491:
+ goto l489
+ l490:
+ position, tokenIndex = position490, tokenIndex490
}
if buffer[position] != rune('}') {
- goto l481
+ goto l483
}
position++
- add(ruleAVX512Token, position482)
+ add(ruleAVX512Token, position484)
}
return true
- l481:
- position, tokenIndex = position481, tokenIndex481
+ l483:
+ position, tokenIndex = position483, tokenIndex483
return false
},
/* 36 TOCRefHigh <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('h' / 'H') ('a' / 'A')))> */
func() bool {
- position491, tokenIndex491 := position, tokenIndex
+ position493, tokenIndex493 := position, tokenIndex
{
- position492 := position
+ position494 := position
if buffer[position] != rune('.') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('T') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('O') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('C') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('.') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('-') {
- goto l491
+ goto l493
}
position++
{
- position493, tokenIndex493 := position, tokenIndex
+ position495, tokenIndex495 := position, tokenIndex
if buffer[position] != rune('0') {
- goto l494
+ goto l496
}
position++
if buffer[position] != rune('b') {
- goto l494
+ goto l496
}
position++
- goto l493
- l494:
- position, tokenIndex = position493, tokenIndex493
+ goto l495
+ l496:
+ position, tokenIndex = position495, tokenIndex495
if buffer[position] != rune('.') {
- goto l491
+ goto l493
}
position++
if buffer[position] != rune('L') {
- goto l491
+ goto l493
}
position++
{
- position497, tokenIndex497 := position, tokenIndex
+ position499, tokenIndex499 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l498
- }
- position++
- goto l497
- l498:
- position, tokenIndex = position497, tokenIndex497
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l499
- }
- position++
- goto l497
- l499:
- position, tokenIndex = position497, tokenIndex497
- if buffer[position] != rune('_') {
goto l500
}
position++
- goto l497
+ goto l499
l500:
- position, tokenIndex = position497, tokenIndex497
+ position, tokenIndex = position499, tokenIndex499
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l501
+ }
+ position++
+ goto l499
+ l501:
+ position, tokenIndex = position499, tokenIndex499
+ if buffer[position] != rune('_') {
+ goto l502
+ }
+ position++
+ goto l499
+ l502:
+ position, tokenIndex = position499, tokenIndex499
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l491
+ goto l493
}
position++
}
+ l499:
l497:
- l495:
{
- position496, tokenIndex496 := position, tokenIndex
+ position498, tokenIndex498 := position, tokenIndex
{
- position501, tokenIndex501 := position, tokenIndex
+ position503, tokenIndex503 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l502
- }
- position++
- goto l501
- l502:
- position, tokenIndex = position501, tokenIndex501
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l503
- }
- position++
- goto l501
- l503:
- position, tokenIndex = position501, tokenIndex501
- if buffer[position] != rune('_') {
goto l504
}
position++
- goto l501
+ goto l503
l504:
- position, tokenIndex = position501, tokenIndex501
+ position, tokenIndex = position503, tokenIndex503
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l505
+ }
+ position++
+ goto l503
+ l505:
+ position, tokenIndex = position503, tokenIndex503
+ if buffer[position] != rune('_') {
+ goto l506
+ }
+ position++
+ goto l503
+ l506:
+ position, tokenIndex = position503, tokenIndex503
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l496
+ goto l498
}
position++
}
- l501:
- goto l495
- l496:
- position, tokenIndex = position496, tokenIndex496
+ l503:
+ goto l497
+ l498:
+ position, tokenIndex = position498, tokenIndex498
}
}
- l493:
+ l495:
if buffer[position] != rune('@') {
- goto l491
+ goto l493
}
position++
{
- position505, tokenIndex505 := position, tokenIndex
- if buffer[position] != rune('h') {
- goto l506
- }
- position++
- goto l505
- l506:
- position, tokenIndex = position505, tokenIndex505
- if buffer[position] != rune('H') {
- goto l491
- }
- position++
- }
- l505:
- {
position507, tokenIndex507 := position, tokenIndex
- if buffer[position] != rune('a') {
+ if buffer[position] != rune('h') {
goto l508
}
position++
goto l507
l508:
position, tokenIndex = position507, tokenIndex507
- if buffer[position] != rune('A') {
- goto l491
+ if buffer[position] != rune('H') {
+ goto l493
}
position++
}
l507:
- add(ruleTOCRefHigh, position492)
+ {
+ position509, tokenIndex509 := position, tokenIndex
+ if buffer[position] != rune('a') {
+ goto l510
+ }
+ position++
+ goto l509
+ l510:
+ position, tokenIndex = position509, tokenIndex509
+ if buffer[position] != rune('A') {
+ goto l493
+ }
+ position++
+ }
+ l509:
+ add(ruleTOCRefHigh, position494)
}
return true
- l491:
- position, tokenIndex = position491, tokenIndex491
+ l493:
+ position, tokenIndex = position493, tokenIndex493
return false
},
/* 37 TOCRefLow <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('l' / 'L')))> */
func() bool {
- position509, tokenIndex509 := position, tokenIndex
+ position511, tokenIndex511 := position, tokenIndex
{
- position510 := position
+ position512 := position
if buffer[position] != rune('.') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('T') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('O') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('C') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('.') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('-') {
- goto l509
+ goto l511
}
position++
{
- position511, tokenIndex511 := position, tokenIndex
+ position513, tokenIndex513 := position, tokenIndex
if buffer[position] != rune('0') {
- goto l512
+ goto l514
}
position++
if buffer[position] != rune('b') {
- goto l512
+ goto l514
}
position++
- goto l511
- l512:
- position, tokenIndex = position511, tokenIndex511
+ goto l513
+ l514:
+ position, tokenIndex = position513, tokenIndex513
if buffer[position] != rune('.') {
- goto l509
+ goto l511
}
position++
if buffer[position] != rune('L') {
- goto l509
+ goto l511
}
position++
{
- position515, tokenIndex515 := position, tokenIndex
+ position517, tokenIndex517 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l516
- }
- position++
- goto l515
- l516:
- position, tokenIndex = position515, tokenIndex515
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l517
- }
- position++
- goto l515
- l517:
- position, tokenIndex = position515, tokenIndex515
- if buffer[position] != rune('_') {
goto l518
}
position++
- goto l515
+ goto l517
l518:
- position, tokenIndex = position515, tokenIndex515
+ position, tokenIndex = position517, tokenIndex517
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l519
+ }
+ position++
+ goto l517
+ l519:
+ position, tokenIndex = position517, tokenIndex517
+ if buffer[position] != rune('_') {
+ goto l520
+ }
+ position++
+ goto l517
+ l520:
+ position, tokenIndex = position517, tokenIndex517
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l509
+ goto l511
}
position++
}
+ l517:
l515:
- l513:
{
- position514, tokenIndex514 := position, tokenIndex
+ position516, tokenIndex516 := position, tokenIndex
{
- position519, tokenIndex519 := position, tokenIndex
+ position521, tokenIndex521 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l520
- }
- position++
- goto l519
- l520:
- position, tokenIndex = position519, tokenIndex519
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l521
- }
- position++
- goto l519
- l521:
- position, tokenIndex = position519, tokenIndex519
- if buffer[position] != rune('_') {
goto l522
}
position++
- goto l519
+ goto l521
l522:
- position, tokenIndex = position519, tokenIndex519
+ position, tokenIndex = position521, tokenIndex521
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l523
+ }
+ position++
+ goto l521
+ l523:
+ position, tokenIndex = position521, tokenIndex521
+ if buffer[position] != rune('_') {
+ goto l524
+ }
+ position++
+ goto l521
+ l524:
+ position, tokenIndex = position521, tokenIndex521
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l514
+ goto l516
}
position++
}
- l519:
- goto l513
- l514:
- position, tokenIndex = position514, tokenIndex514
+ l521:
+ goto l515
+ l516:
+ position, tokenIndex = position516, tokenIndex516
}
}
- l511:
+ l513:
if buffer[position] != rune('@') {
- goto l509
+ goto l511
}
position++
{
- position523, tokenIndex523 := position, tokenIndex
+ position525, tokenIndex525 := position, tokenIndex
if buffer[position] != rune('l') {
- goto l524
+ goto l526
}
position++
- goto l523
- l524:
- position, tokenIndex = position523, tokenIndex523
+ goto l525
+ l526:
+ position, tokenIndex = position525, tokenIndex525
if buffer[position] != rune('L') {
- goto l509
+ goto l511
}
position++
}
- l523:
- add(ruleTOCRefLow, position510)
+ l525:
+ add(ruleTOCRefLow, position512)
}
return true
- l509:
- position, tokenIndex = position509, tokenIndex509
+ l511:
+ position, tokenIndex = position511, tokenIndex511
return false
},
/* 38 IndirectionIndicator <- <'*'> */
func() bool {
- position525, tokenIndex525 := position, tokenIndex
+ position527, tokenIndex527 := position, tokenIndex
{
- position526 := position
+ position528 := position
if buffer[position] != rune('*') {
- goto l525
+ goto l527
}
position++
- add(ruleIndirectionIndicator, position526)
+ add(ruleIndirectionIndicator, position528)
}
return true
- l525:
- position, tokenIndex = position525, tokenIndex525
+ l527:
+ position, tokenIndex = position527, tokenIndex527
return false
},
/* 39 RegisterOrConstant <- <((('%' ([a-z] / [A-Z]) ([a-z] / [A-Z] / ([0-9] / [0-9]))*) / ('$'? ((Offset Offset) / Offset)) / ('#' Offset ('*' [0-9]+ ('-' [0-9] [0-9]*)?)?) / ('#' '~'? '(' [0-9] WS? ('<' '<') WS? [0-9] ')') / ARMRegister) !('f' / 'b' / ':' / '(' / '+' / '-'))> */
func() bool {
- position527, tokenIndex527 := position, tokenIndex
+ position529, tokenIndex529 := position, tokenIndex
{
- position528 := position
+ position530 := position
{
- position529, tokenIndex529 := position, tokenIndex
+ position531, tokenIndex531 := position, tokenIndex
if buffer[position] != rune('%') {
- goto l530
+ goto l532
}
position++
{
- position531, tokenIndex531 := position, tokenIndex
+ position533, tokenIndex533 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
+ goto l534
+ }
+ position++
+ goto l533
+ l534:
+ position, tokenIndex = position533, tokenIndex533
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
goto l532
}
position++
- goto l531
- l532:
- position, tokenIndex = position531, tokenIndex531
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l530
- }
- position++
}
- l531:
l533:
+ l535:
{
- position534, tokenIndex534 := position, tokenIndex
+ position536, tokenIndex536 := position, tokenIndex
{
- position535, tokenIndex535 := position, tokenIndex
+ position537, tokenIndex537 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l536
- }
- position++
- goto l535
- l536:
- position, tokenIndex = position535, tokenIndex535
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l537
- }
- position++
- goto l535
- l537:
- position, tokenIndex = position535, tokenIndex535
- {
- position538, tokenIndex538 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l539
- }
- position++
goto l538
- l539:
- position, tokenIndex = position538, tokenIndex538
+ }
+ position++
+ goto l537
+ l538:
+ position, tokenIndex = position537, tokenIndex537
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l539
+ }
+ position++
+ goto l537
+ l539:
+ position, tokenIndex = position537, tokenIndex537
+ {
+ position540, tokenIndex540 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l534
+ goto l541
+ }
+ position++
+ goto l540
+ l541:
+ position, tokenIndex = position540, tokenIndex540
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l536
}
position++
}
- l538:
+ l540:
}
- l535:
- goto l533
- l534:
- position, tokenIndex = position534, tokenIndex534
+ l537:
+ goto l535
+ l536:
+ position, tokenIndex = position536, tokenIndex536
}
- goto l529
- l530:
- position, tokenIndex = position529, tokenIndex529
- {
- position541, tokenIndex541 := position, tokenIndex
- if buffer[position] != rune('$') {
- goto l541
- }
- position++
- goto l542
- l541:
- position, tokenIndex = position541, tokenIndex541
- }
- l542:
+ goto l531
+ l532:
+ position, tokenIndex = position531, tokenIndex531
{
position543, tokenIndex543 := position, tokenIndex
- if !_rules[ruleOffset]() {
- goto l544
+ if buffer[position] != rune('$') {
+ goto l543
}
- if !_rules[ruleOffset]() {
- goto l544
- }
- goto l543
- l544:
+ position++
+ goto l544
+ l543:
position, tokenIndex = position543, tokenIndex543
+ }
+ l544:
+ {
+ position545, tokenIndex545 := position, tokenIndex
if !_rules[ruleOffset]() {
- goto l540
+ goto l546
+ }
+ if !_rules[ruleOffset]() {
+ goto l546
+ }
+ goto l545
+ l546:
+ position, tokenIndex = position545, tokenIndex545
+ if !_rules[ruleOffset]() {
+ goto l542
}
}
- l543:
- goto l529
- l540:
- position, tokenIndex = position529, tokenIndex529
+ l545:
+ goto l531
+ l542:
+ position, tokenIndex = position531, tokenIndex531
if buffer[position] != rune('#') {
- goto l545
+ goto l547
}
position++
if !_rules[ruleOffset]() {
- goto l545
+ goto l547
}
{
- position546, tokenIndex546 := position, tokenIndex
+ position548, tokenIndex548 := position, tokenIndex
if buffer[position] != rune('*') {
- goto l546
+ goto l548
}
position++
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l546
+ goto l548
}
position++
- l548:
+ l550:
{
- position549, tokenIndex549 := position, tokenIndex
+ position551, tokenIndex551 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l549
+ goto l551
}
position++
- goto l548
- l549:
- position, tokenIndex = position549, tokenIndex549
+ goto l550
+ l551:
+ position, tokenIndex = position551, tokenIndex551
}
{
- position550, tokenIndex550 := position, tokenIndex
+ position552, tokenIndex552 := position, tokenIndex
if buffer[position] != rune('-') {
- goto l550
+ goto l552
}
position++
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l550
+ goto l552
}
position++
- l552:
+ l554:
{
- position553, tokenIndex553 := position, tokenIndex
+ position555, tokenIndex555 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l553
+ goto l555
}
position++
- goto l552
- l553:
- position, tokenIndex = position553, tokenIndex553
+ goto l554
+ l555:
+ position, tokenIndex = position555, tokenIndex555
}
- goto l551
- l550:
- position, tokenIndex = position550, tokenIndex550
+ goto l553
+ l552:
+ position, tokenIndex = position552, tokenIndex552
}
- l551:
- goto l547
- l546:
- position, tokenIndex = position546, tokenIndex546
+ l553:
+ goto l549
+ l548:
+ position, tokenIndex = position548, tokenIndex548
}
+ l549:
+ goto l531
l547:
- goto l529
- l545:
- position, tokenIndex = position529, tokenIndex529
+ position, tokenIndex = position531, tokenIndex531
if buffer[position] != rune('#') {
- goto l554
- }
- position++
- {
- position555, tokenIndex555 := position, tokenIndex
- if buffer[position] != rune('~') {
- goto l555
- }
- position++
goto l556
- l555:
- position, tokenIndex = position555, tokenIndex555
- }
- l556:
- if buffer[position] != rune('(') {
- goto l554
- }
- position++
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l554
}
position++
{
position557, tokenIndex557 := position, tokenIndex
- if !_rules[ruleWS]() {
+ if buffer[position] != rune('~') {
goto l557
}
+ position++
goto l558
l557:
position, tokenIndex = position557, tokenIndex557
}
l558:
- if buffer[position] != rune('<') {
- goto l554
+ if buffer[position] != rune('(') {
+ goto l556
}
position++
- if buffer[position] != rune('<') {
- goto l554
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l556
}
position++
{
@@ -4482,1506 +4471,1514 @@
position, tokenIndex = position559, tokenIndex559
}
l560:
+ if buffer[position] != rune('<') {
+ goto l556
+ }
+ position++
+ if buffer[position] != rune('<') {
+ goto l556
+ }
+ position++
+ {
+ position561, tokenIndex561 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l561
+ }
+ goto l562
+ l561:
+ position, tokenIndex = position561, tokenIndex561
+ }
+ l562:
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l554
+ goto l556
}
position++
if buffer[position] != rune(')') {
- goto l554
+ goto l556
}
position++
- goto l529
- l554:
- position, tokenIndex = position529, tokenIndex529
+ goto l531
+ l556:
+ position, tokenIndex = position531, tokenIndex531
if !_rules[ruleARMRegister]() {
- goto l527
+ goto l529
}
}
- l529:
+ l531:
{
- position561, tokenIndex561 := position, tokenIndex
+ position563, tokenIndex563 := position, tokenIndex
{
- position562, tokenIndex562 := position, tokenIndex
+ position564, tokenIndex564 := position, tokenIndex
if buffer[position] != rune('f') {
- goto l563
- }
- position++
- goto l562
- l563:
- position, tokenIndex = position562, tokenIndex562
- if buffer[position] != rune('b') {
- goto l564
- }
- position++
- goto l562
- l564:
- position, tokenIndex = position562, tokenIndex562
- if buffer[position] != rune(':') {
goto l565
}
position++
- goto l562
+ goto l564
l565:
- position, tokenIndex = position562, tokenIndex562
- if buffer[position] != rune('(') {
+ position, tokenIndex = position564, tokenIndex564
+ if buffer[position] != rune('b') {
goto l566
}
position++
- goto l562
+ goto l564
l566:
- position, tokenIndex = position562, tokenIndex562
- if buffer[position] != rune('+') {
+ position, tokenIndex = position564, tokenIndex564
+ if buffer[position] != rune(':') {
goto l567
}
position++
- goto l562
+ goto l564
l567:
- position, tokenIndex = position562, tokenIndex562
+ position, tokenIndex = position564, tokenIndex564
+ if buffer[position] != rune('(') {
+ goto l568
+ }
+ position++
+ goto l564
+ l568:
+ position, tokenIndex = position564, tokenIndex564
+ if buffer[position] != rune('+') {
+ goto l569
+ }
+ position++
+ goto l564
+ l569:
+ position, tokenIndex = position564, tokenIndex564
if buffer[position] != rune('-') {
- goto l561
+ goto l563
}
position++
}
- l562:
- goto l527
- l561:
- position, tokenIndex = position561, tokenIndex561
+ l564:
+ goto l529
+ l563:
+ position, tokenIndex = position563, tokenIndex563
}
- add(ruleRegisterOrConstant, position528)
+ add(ruleRegisterOrConstant, position530)
}
return true
- l527:
- position, tokenIndex = position527, tokenIndex527
+ l529:
+ position, tokenIndex = position529, tokenIndex529
return false
},
/* 40 ARMConstantTweak <- <(((('l' / 'L') ('s' / 'S') ('l' / 'L')) / (('s' / 'S') ('x' / 'X') ('t' / 'T') ('w' / 'W')) / (('s' / 'S') ('x' / 'X') ('t' / 'T') ('b' / 'B')) / (('u' / 'U') ('x' / 'X') ('t' / 'T') ('w' / 'W')) / (('u' / 'U') ('x' / 'X') ('t' / 'T') ('b' / 'B')) / (('l' / 'L') ('s' / 'S') ('r' / 'R')) / (('r' / 'R') ('o' / 'O') ('r' / 'R')) / (('a' / 'A') ('s' / 'S') ('r' / 'R'))) (WS '#' Offset)?)> */
func() bool {
- position568, tokenIndex568 := position, tokenIndex
+ position570, tokenIndex570 := position, tokenIndex
{
- position569 := position
+ position571 := position
{
- position570, tokenIndex570 := position, tokenIndex
- {
- position572, tokenIndex572 := position, tokenIndex
- if buffer[position] != rune('l') {
- goto l573
- }
- position++
- goto l572
- l573:
- position, tokenIndex = position572, tokenIndex572
- if buffer[position] != rune('L') {
- goto l571
- }
- position++
- }
- l572:
+ position572, tokenIndex572 := position, tokenIndex
{
position574, tokenIndex574 := position, tokenIndex
- if buffer[position] != rune('s') {
+ if buffer[position] != rune('l') {
goto l575
}
position++
goto l574
l575:
position, tokenIndex = position574, tokenIndex574
- if buffer[position] != rune('S') {
- goto l571
+ if buffer[position] != rune('L') {
+ goto l573
}
position++
}
l574:
{
position576, tokenIndex576 := position, tokenIndex
- if buffer[position] != rune('l') {
+ if buffer[position] != rune('s') {
goto l577
}
position++
goto l576
l577:
position, tokenIndex = position576, tokenIndex576
- if buffer[position] != rune('L') {
- goto l571
+ if buffer[position] != rune('S') {
+ goto l573
}
position++
}
l576:
- goto l570
- l571:
- position, tokenIndex = position570, tokenIndex570
{
- position579, tokenIndex579 := position, tokenIndex
- if buffer[position] != rune('s') {
- goto l580
+ position578, tokenIndex578 := position, tokenIndex
+ if buffer[position] != rune('l') {
+ goto l579
}
position++
- goto l579
- l580:
- position, tokenIndex = position579, tokenIndex579
- if buffer[position] != rune('S') {
- goto l578
+ goto l578
+ l579:
+ position, tokenIndex = position578, tokenIndex578
+ if buffer[position] != rune('L') {
+ goto l573
}
position++
}
- l579:
+ l578:
+ goto l572
+ l573:
+ position, tokenIndex = position572, tokenIndex572
{
position581, tokenIndex581 := position, tokenIndex
- if buffer[position] != rune('x') {
+ if buffer[position] != rune('s') {
goto l582
}
position++
goto l581
l582:
position, tokenIndex = position581, tokenIndex581
- if buffer[position] != rune('X') {
- goto l578
+ if buffer[position] != rune('S') {
+ goto l580
}
position++
}
l581:
{
position583, tokenIndex583 := position, tokenIndex
- if buffer[position] != rune('t') {
+ if buffer[position] != rune('x') {
goto l584
}
position++
goto l583
l584:
position, tokenIndex = position583, tokenIndex583
- if buffer[position] != rune('T') {
- goto l578
+ if buffer[position] != rune('X') {
+ goto l580
}
position++
}
l583:
{
position585, tokenIndex585 := position, tokenIndex
- if buffer[position] != rune('w') {
+ if buffer[position] != rune('t') {
goto l586
}
position++
goto l585
l586:
position, tokenIndex = position585, tokenIndex585
- if buffer[position] != rune('W') {
- goto l578
+ if buffer[position] != rune('T') {
+ goto l580
}
position++
}
l585:
- goto l570
- l578:
- position, tokenIndex = position570, tokenIndex570
{
- position588, tokenIndex588 := position, tokenIndex
- if buffer[position] != rune('s') {
- goto l589
+ position587, tokenIndex587 := position, tokenIndex
+ if buffer[position] != rune('w') {
+ goto l588
}
position++
- goto l588
- l589:
- position, tokenIndex = position588, tokenIndex588
- if buffer[position] != rune('S') {
- goto l587
+ goto l587
+ l588:
+ position, tokenIndex = position587, tokenIndex587
+ if buffer[position] != rune('W') {
+ goto l580
}
position++
}
- l588:
+ l587:
+ goto l572
+ l580:
+ position, tokenIndex = position572, tokenIndex572
{
position590, tokenIndex590 := position, tokenIndex
- if buffer[position] != rune('x') {
+ if buffer[position] != rune('s') {
goto l591
}
position++
goto l590
l591:
position, tokenIndex = position590, tokenIndex590
- if buffer[position] != rune('X') {
- goto l587
+ if buffer[position] != rune('S') {
+ goto l589
}
position++
}
l590:
{
position592, tokenIndex592 := position, tokenIndex
- if buffer[position] != rune('t') {
+ if buffer[position] != rune('x') {
goto l593
}
position++
goto l592
l593:
position, tokenIndex = position592, tokenIndex592
- if buffer[position] != rune('T') {
- goto l587
+ if buffer[position] != rune('X') {
+ goto l589
}
position++
}
l592:
{
position594, tokenIndex594 := position, tokenIndex
- if buffer[position] != rune('b') {
+ if buffer[position] != rune('t') {
goto l595
}
position++
goto l594
l595:
position, tokenIndex = position594, tokenIndex594
- if buffer[position] != rune('B') {
- goto l587
+ if buffer[position] != rune('T') {
+ goto l589
}
position++
}
l594:
- goto l570
- l587:
- position, tokenIndex = position570, tokenIndex570
{
- position597, tokenIndex597 := position, tokenIndex
- if buffer[position] != rune('u') {
- goto l598
+ position596, tokenIndex596 := position, tokenIndex
+ if buffer[position] != rune('b') {
+ goto l597
}
position++
- goto l597
- l598:
- position, tokenIndex = position597, tokenIndex597
- if buffer[position] != rune('U') {
- goto l596
+ goto l596
+ l597:
+ position, tokenIndex = position596, tokenIndex596
+ if buffer[position] != rune('B') {
+ goto l589
}
position++
}
- l597:
+ l596:
+ goto l572
+ l589:
+ position, tokenIndex = position572, tokenIndex572
{
position599, tokenIndex599 := position, tokenIndex
- if buffer[position] != rune('x') {
+ if buffer[position] != rune('u') {
goto l600
}
position++
goto l599
l600:
position, tokenIndex = position599, tokenIndex599
- if buffer[position] != rune('X') {
- goto l596
+ if buffer[position] != rune('U') {
+ goto l598
}
position++
}
l599:
{
position601, tokenIndex601 := position, tokenIndex
- if buffer[position] != rune('t') {
+ if buffer[position] != rune('x') {
goto l602
}
position++
goto l601
l602:
position, tokenIndex = position601, tokenIndex601
- if buffer[position] != rune('T') {
- goto l596
+ if buffer[position] != rune('X') {
+ goto l598
}
position++
}
l601:
{
position603, tokenIndex603 := position, tokenIndex
- if buffer[position] != rune('w') {
+ if buffer[position] != rune('t') {
goto l604
}
position++
goto l603
l604:
position, tokenIndex = position603, tokenIndex603
- if buffer[position] != rune('W') {
- goto l596
+ if buffer[position] != rune('T') {
+ goto l598
}
position++
}
l603:
- goto l570
- l596:
- position, tokenIndex = position570, tokenIndex570
{
- position606, tokenIndex606 := position, tokenIndex
- if buffer[position] != rune('u') {
- goto l607
+ position605, tokenIndex605 := position, tokenIndex
+ if buffer[position] != rune('w') {
+ goto l606
}
position++
- goto l606
- l607:
- position, tokenIndex = position606, tokenIndex606
- if buffer[position] != rune('U') {
- goto l605
+ goto l605
+ l606:
+ position, tokenIndex = position605, tokenIndex605
+ if buffer[position] != rune('W') {
+ goto l598
}
position++
}
- l606:
+ l605:
+ goto l572
+ l598:
+ position, tokenIndex = position572, tokenIndex572
{
position608, tokenIndex608 := position, tokenIndex
- if buffer[position] != rune('x') {
+ if buffer[position] != rune('u') {
goto l609
}
position++
goto l608
l609:
position, tokenIndex = position608, tokenIndex608
- if buffer[position] != rune('X') {
- goto l605
+ if buffer[position] != rune('U') {
+ goto l607
}
position++
}
l608:
{
position610, tokenIndex610 := position, tokenIndex
- if buffer[position] != rune('t') {
+ if buffer[position] != rune('x') {
goto l611
}
position++
goto l610
l611:
position, tokenIndex = position610, tokenIndex610
- if buffer[position] != rune('T') {
- goto l605
+ if buffer[position] != rune('X') {
+ goto l607
}
position++
}
l610:
{
position612, tokenIndex612 := position, tokenIndex
- if buffer[position] != rune('b') {
+ if buffer[position] != rune('t') {
goto l613
}
position++
goto l612
l613:
position, tokenIndex = position612, tokenIndex612
- if buffer[position] != rune('B') {
- goto l605
+ if buffer[position] != rune('T') {
+ goto l607
}
position++
}
l612:
- goto l570
- l605:
- position, tokenIndex = position570, tokenIndex570
{
- position615, tokenIndex615 := position, tokenIndex
- if buffer[position] != rune('l') {
- goto l616
+ position614, tokenIndex614 := position, tokenIndex
+ if buffer[position] != rune('b') {
+ goto l615
}
position++
- goto l615
- l616:
- position, tokenIndex = position615, tokenIndex615
- if buffer[position] != rune('L') {
- goto l614
+ goto l614
+ l615:
+ position, tokenIndex = position614, tokenIndex614
+ if buffer[position] != rune('B') {
+ goto l607
}
position++
}
- l615:
+ l614:
+ goto l572
+ l607:
+ position, tokenIndex = position572, tokenIndex572
{
position617, tokenIndex617 := position, tokenIndex
- if buffer[position] != rune('s') {
+ if buffer[position] != rune('l') {
goto l618
}
position++
goto l617
l618:
position, tokenIndex = position617, tokenIndex617
- if buffer[position] != rune('S') {
- goto l614
+ if buffer[position] != rune('L') {
+ goto l616
}
position++
}
l617:
{
position619, tokenIndex619 := position, tokenIndex
- if buffer[position] != rune('r') {
+ if buffer[position] != rune('s') {
goto l620
}
position++
goto l619
l620:
position, tokenIndex = position619, tokenIndex619
- if buffer[position] != rune('R') {
- goto l614
+ if buffer[position] != rune('S') {
+ goto l616
}
position++
}
l619:
- goto l570
- l614:
- position, tokenIndex = position570, tokenIndex570
{
- position622, tokenIndex622 := position, tokenIndex
+ position621, tokenIndex621 := position, tokenIndex
if buffer[position] != rune('r') {
- goto l623
+ goto l622
}
position++
- goto l622
- l623:
- position, tokenIndex = position622, tokenIndex622
+ goto l621
+ l622:
+ position, tokenIndex = position621, tokenIndex621
if buffer[position] != rune('R') {
- goto l621
+ goto l616
}
position++
}
- l622:
+ l621:
+ goto l572
+ l616:
+ position, tokenIndex = position572, tokenIndex572
{
position624, tokenIndex624 := position, tokenIndex
- if buffer[position] != rune('o') {
+ if buffer[position] != rune('r') {
goto l625
}
position++
goto l624
l625:
position, tokenIndex = position624, tokenIndex624
- if buffer[position] != rune('O') {
- goto l621
+ if buffer[position] != rune('R') {
+ goto l623
}
position++
}
l624:
{
position626, tokenIndex626 := position, tokenIndex
- if buffer[position] != rune('r') {
+ if buffer[position] != rune('o') {
goto l627
}
position++
goto l626
l627:
position, tokenIndex = position626, tokenIndex626
- if buffer[position] != rune('R') {
- goto l621
+ if buffer[position] != rune('O') {
+ goto l623
}
position++
}
l626:
- goto l570
- l621:
- position, tokenIndex = position570, tokenIndex570
{
position628, tokenIndex628 := position, tokenIndex
- if buffer[position] != rune('a') {
+ if buffer[position] != rune('r') {
goto l629
}
position++
goto l628
l629:
position, tokenIndex = position628, tokenIndex628
- if buffer[position] != rune('A') {
- goto l568
+ if buffer[position] != rune('R') {
+ goto l623
}
position++
}
l628:
+ goto l572
+ l623:
+ position, tokenIndex = position572, tokenIndex572
{
position630, tokenIndex630 := position, tokenIndex
- if buffer[position] != rune('s') {
+ if buffer[position] != rune('a') {
goto l631
}
position++
goto l630
l631:
position, tokenIndex = position630, tokenIndex630
- if buffer[position] != rune('S') {
- goto l568
+ if buffer[position] != rune('A') {
+ goto l570
}
position++
}
l630:
{
position632, tokenIndex632 := position, tokenIndex
- if buffer[position] != rune('r') {
+ if buffer[position] != rune('s') {
goto l633
}
position++
goto l632
l633:
position, tokenIndex = position632, tokenIndex632
- if buffer[position] != rune('R') {
- goto l568
+ if buffer[position] != rune('S') {
+ goto l570
}
position++
}
l632:
- }
- l570:
- {
- position634, tokenIndex634 := position, tokenIndex
- if !_rules[ruleWS]() {
+ {
+ position634, tokenIndex634 := position, tokenIndex
+ if buffer[position] != rune('r') {
+ goto l635
+ }
+ position++
goto l634
+ l635:
+ position, tokenIndex = position634, tokenIndex634
+ if buffer[position] != rune('R') {
+ goto l570
+ }
+ position++
+ }
+ l634:
+ }
+ l572:
+ {
+ position636, tokenIndex636 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l636
}
if buffer[position] != rune('#') {
- goto l634
+ goto l636
}
position++
if !_rules[ruleOffset]() {
- goto l634
+ goto l636
}
- goto l635
- l634:
- position, tokenIndex = position634, tokenIndex634
+ goto l637
+ l636:
+ position, tokenIndex = position636, tokenIndex636
}
- l635:
- add(ruleARMConstantTweak, position569)
+ l637:
+ add(ruleARMConstantTweak, position571)
}
return true
- l568:
- position, tokenIndex = position568, tokenIndex568
+ l570:
+ position, tokenIndex = position570, tokenIndex570
return false
},
/* 41 ARMRegister <- <((('s' / 'S') ('p' / 'P')) / (('x' / 'w' / 'd' / 'q' / 's') [0-9] [0-9]?) / (('x' / 'X') ('z' / 'Z') ('r' / 'R')) / (('w' / 'W') ('z' / 'Z') ('r' / 'R')) / ARMVectorRegister / ('{' WS? ARMVectorRegister (',' WS? ARMVectorRegister)* WS? '}' ('[' [0-9] [0-9]? ']')?))> */
func() bool {
- position636, tokenIndex636 := position, tokenIndex
+ position638, tokenIndex638 := position, tokenIndex
{
- position637 := position
+ position639 := position
{
- position638, tokenIndex638 := position, tokenIndex
- {
- position640, tokenIndex640 := position, tokenIndex
- if buffer[position] != rune('s') {
- goto l641
- }
- position++
- goto l640
- l641:
- position, tokenIndex = position640, tokenIndex640
- if buffer[position] != rune('S') {
- goto l639
- }
- position++
- }
- l640:
+ position640, tokenIndex640 := position, tokenIndex
{
position642, tokenIndex642 := position, tokenIndex
- if buffer[position] != rune('p') {
+ if buffer[position] != rune('s') {
goto l643
}
position++
goto l642
l643:
position, tokenIndex = position642, tokenIndex642
- if buffer[position] != rune('P') {
- goto l639
+ if buffer[position] != rune('S') {
+ goto l641
}
position++
}
l642:
- goto l638
- l639:
- position, tokenIndex = position638, tokenIndex638
{
- position645, tokenIndex645 := position, tokenIndex
+ position644, tokenIndex644 := position, tokenIndex
+ if buffer[position] != rune('p') {
+ goto l645
+ }
+ position++
+ goto l644
+ l645:
+ position, tokenIndex = position644, tokenIndex644
+ if buffer[position] != rune('P') {
+ goto l641
+ }
+ position++
+ }
+ l644:
+ goto l640
+ l641:
+ position, tokenIndex = position640, tokenIndex640
+ {
+ position647, tokenIndex647 := position, tokenIndex
if buffer[position] != rune('x') {
- goto l646
- }
- position++
- goto l645
- l646:
- position, tokenIndex = position645, tokenIndex645
- if buffer[position] != rune('w') {
- goto l647
- }
- position++
- goto l645
- l647:
- position, tokenIndex = position645, tokenIndex645
- if buffer[position] != rune('d') {
goto l648
}
position++
- goto l645
+ goto l647
l648:
- position, tokenIndex = position645, tokenIndex645
- if buffer[position] != rune('q') {
+ position, tokenIndex = position647, tokenIndex647
+ if buffer[position] != rune('w') {
goto l649
}
position++
- goto l645
+ goto l647
l649:
- position, tokenIndex = position645, tokenIndex645
- if buffer[position] != rune('s') {
- goto l644
- }
- position++
- }
- l645:
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l644
- }
- position++
- {
- position650, tokenIndex650 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
+ position, tokenIndex = position647, tokenIndex647
+ if buffer[position] != rune('d') {
goto l650
}
position++
- goto l651
+ goto l647
l650:
- position, tokenIndex = position650, tokenIndex650
- }
- l651:
- goto l638
- l644:
- position, tokenIndex = position638, tokenIndex638
- {
- position653, tokenIndex653 := position, tokenIndex
- if buffer[position] != rune('x') {
- goto l654
+ position, tokenIndex = position647, tokenIndex647
+ if buffer[position] != rune('q') {
+ goto l651
}
position++
- goto l653
- l654:
- position, tokenIndex = position653, tokenIndex653
- if buffer[position] != rune('X') {
+ goto l647
+ l651:
+ position, tokenIndex = position647, tokenIndex647
+ if buffer[position] != rune('s') {
+ goto l646
+ }
+ position++
+ }
+ l647:
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l646
+ }
+ position++
+ {
+ position652, tokenIndex652 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l652
}
position++
+ goto l653
+ l652:
+ position, tokenIndex = position652, tokenIndex652
}
l653:
+ goto l640
+ l646:
+ position, tokenIndex = position640, tokenIndex640
{
position655, tokenIndex655 := position, tokenIndex
- if buffer[position] != rune('z') {
+ if buffer[position] != rune('x') {
goto l656
}
position++
goto l655
l656:
position, tokenIndex = position655, tokenIndex655
- if buffer[position] != rune('Z') {
- goto l652
+ if buffer[position] != rune('X') {
+ goto l654
}
position++
}
l655:
{
position657, tokenIndex657 := position, tokenIndex
- if buffer[position] != rune('r') {
+ if buffer[position] != rune('z') {
goto l658
}
position++
goto l657
l658:
position, tokenIndex = position657, tokenIndex657
- if buffer[position] != rune('R') {
- goto l652
+ if buffer[position] != rune('Z') {
+ goto l654
}
position++
}
l657:
- goto l638
- l652:
- position, tokenIndex = position638, tokenIndex638
{
- position660, tokenIndex660 := position, tokenIndex
- if buffer[position] != rune('w') {
- goto l661
+ position659, tokenIndex659 := position, tokenIndex
+ if buffer[position] != rune('r') {
+ goto l660
}
position++
- goto l660
- l661:
- position, tokenIndex = position660, tokenIndex660
- if buffer[position] != rune('W') {
- goto l659
+ goto l659
+ l660:
+ position, tokenIndex = position659, tokenIndex659
+ if buffer[position] != rune('R') {
+ goto l654
}
position++
}
- l660:
+ l659:
+ goto l640
+ l654:
+ position, tokenIndex = position640, tokenIndex640
{
position662, tokenIndex662 := position, tokenIndex
- if buffer[position] != rune('z') {
+ if buffer[position] != rune('w') {
goto l663
}
position++
goto l662
l663:
position, tokenIndex = position662, tokenIndex662
- if buffer[position] != rune('Z') {
- goto l659
+ if buffer[position] != rune('W') {
+ goto l661
}
position++
}
l662:
{
position664, tokenIndex664 := position, tokenIndex
- if buffer[position] != rune('r') {
+ if buffer[position] != rune('z') {
goto l665
}
position++
goto l664
l665:
position, tokenIndex = position664, tokenIndex664
- if buffer[position] != rune('R') {
- goto l659
+ if buffer[position] != rune('Z') {
+ goto l661
}
position++
}
l664:
- goto l638
- l659:
- position, tokenIndex = position638, tokenIndex638
- if !_rules[ruleARMVectorRegister]() {
- goto l666
- }
- goto l638
- l666:
- position, tokenIndex = position638, tokenIndex638
- if buffer[position] != rune('{') {
- goto l636
- }
- position++
{
- position667, tokenIndex667 := position, tokenIndex
- if !_rules[ruleWS]() {
+ position666, tokenIndex666 := position, tokenIndex
+ if buffer[position] != rune('r') {
goto l667
}
- goto l668
+ position++
+ goto l666
l667:
- position, tokenIndex = position667, tokenIndex667
- }
- l668:
- if !_rules[ruleARMVectorRegister]() {
- goto l636
- }
- l669:
- {
- position670, tokenIndex670 := position, tokenIndex
- if buffer[position] != rune(',') {
- goto l670
+ position, tokenIndex = position666, tokenIndex666
+ if buffer[position] != rune('R') {
+ goto l661
}
position++
- {
- position671, tokenIndex671 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l671
- }
- goto l672
- l671:
- position, tokenIndex = position671, tokenIndex671
- }
- l672:
- if !_rules[ruleARMVectorRegister]() {
- goto l670
- }
- goto l669
- l670:
- position, tokenIndex = position670, tokenIndex670
}
- {
- position673, tokenIndex673 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l673
- }
- goto l674
- l673:
- position, tokenIndex = position673, tokenIndex673
+ l666:
+ goto l640
+ l661:
+ position, tokenIndex = position640, tokenIndex640
+ if !_rules[ruleARMVectorRegister]() {
+ goto l668
}
- l674:
- if buffer[position] != rune('}') {
- goto l636
+ goto l640
+ l668:
+ position, tokenIndex = position640, tokenIndex640
+ if buffer[position] != rune('{') {
+ goto l638
}
position++
{
- position675, tokenIndex675 := position, tokenIndex
- if buffer[position] != rune('[') {
- goto l675
+ position669, tokenIndex669 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l669
}
- position++
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l675
+ goto l670
+ l669:
+ position, tokenIndex = position669, tokenIndex669
+ }
+ l670:
+ if !_rules[ruleARMVectorRegister]() {
+ goto l638
+ }
+ l671:
+ {
+ position672, tokenIndex672 := position, tokenIndex
+ if buffer[position] != rune(',') {
+ goto l672
}
position++
{
- position677, tokenIndex677 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l677
+ position673, tokenIndex673 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l673
}
- position++
- goto l678
- l677:
- position, tokenIndex = position677, tokenIndex677
+ goto l674
+ l673:
+ position, tokenIndex = position673, tokenIndex673
}
- l678:
- if buffer[position] != rune(']') {
+ l674:
+ if !_rules[ruleARMVectorRegister]() {
+ goto l672
+ }
+ goto l671
+ l672:
+ position, tokenIndex = position672, tokenIndex672
+ }
+ {
+ position675, tokenIndex675 := position, tokenIndex
+ if !_rules[ruleWS]() {
goto l675
}
- position++
goto l676
l675:
position, tokenIndex = position675, tokenIndex675
}
l676:
+ if buffer[position] != rune('}') {
+ goto l638
+ }
+ position++
+ {
+ position677, tokenIndex677 := position, tokenIndex
+ if buffer[position] != rune('[') {
+ goto l677
+ }
+ position++
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l677
+ }
+ position++
+ {
+ position679, tokenIndex679 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l679
+ }
+ position++
+ goto l680
+ l679:
+ position, tokenIndex = position679, tokenIndex679
+ }
+ l680:
+ if buffer[position] != rune(']') {
+ goto l677
+ }
+ position++
+ goto l678
+ l677:
+ position, tokenIndex = position677, tokenIndex677
+ }
+ l678:
}
- l638:
- add(ruleARMRegister, position637)
+ l640:
+ add(ruleARMRegister, position639)
}
return true
- l636:
- position, tokenIndex = position636, tokenIndex636
+ l638:
+ position, tokenIndex = position638, tokenIndex638
return false
},
/* 42 ARMVectorRegister <- <(('v' / 'V') [0-9] [0-9]? ('.' [0-9]* ('b' / 's' / 'd' / 'h' / 'q') ('[' [0-9] [0-9]? ']')?)?)> */
func() bool {
- position679, tokenIndex679 := position, tokenIndex
+ position681, tokenIndex681 := position, tokenIndex
{
- position680 := position
+ position682 := position
{
- position681, tokenIndex681 := position, tokenIndex
+ position683, tokenIndex683 := position, tokenIndex
if buffer[position] != rune('v') {
- goto l682
+ goto l684
}
position++
- goto l681
- l682:
- position, tokenIndex = position681, tokenIndex681
+ goto l683
+ l684:
+ position, tokenIndex = position683, tokenIndex683
if buffer[position] != rune('V') {
- goto l679
+ goto l681
}
position++
}
- l681:
+ l683:
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l679
+ goto l681
}
position++
{
- position683, tokenIndex683 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l683
- }
- position++
- goto l684
- l683:
- position, tokenIndex = position683, tokenIndex683
- }
- l684:
- {
position685, tokenIndex685 := position, tokenIndex
- if buffer[position] != rune('.') {
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
goto l685
}
position++
- l687:
- {
- position688, tokenIndex688 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l688
- }
- position++
- goto l687
- l688:
- position, tokenIndex = position688, tokenIndex688
- }
- {
- position689, tokenIndex689 := position, tokenIndex
- if buffer[position] != rune('b') {
- goto l690
- }
- position++
- goto l689
- l690:
- position, tokenIndex = position689, tokenIndex689
- if buffer[position] != rune('s') {
- goto l691
- }
- position++
- goto l689
- l691:
- position, tokenIndex = position689, tokenIndex689
- if buffer[position] != rune('d') {
- goto l692
- }
- position++
- goto l689
- l692:
- position, tokenIndex = position689, tokenIndex689
- if buffer[position] != rune('h') {
- goto l693
- }
- position++
- goto l689
- l693:
- position, tokenIndex = position689, tokenIndex689
- if buffer[position] != rune('q') {
- goto l685
- }
- position++
- }
- l689:
- {
- position694, tokenIndex694 := position, tokenIndex
- if buffer[position] != rune('[') {
- goto l694
- }
- position++
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l694
- }
- position++
- {
- position696, tokenIndex696 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l696
- }
- position++
- goto l697
- l696:
- position, tokenIndex = position696, tokenIndex696
- }
- l697:
- if buffer[position] != rune(']') {
- goto l694
- }
- position++
- goto l695
- l694:
- position, tokenIndex = position694, tokenIndex694
- }
- l695:
goto l686
l685:
position, tokenIndex = position685, tokenIndex685
}
l686:
- add(ruleARMVectorRegister, position680)
+ {
+ position687, tokenIndex687 := position, tokenIndex
+ if buffer[position] != rune('.') {
+ goto l687
+ }
+ position++
+ l689:
+ {
+ position690, tokenIndex690 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l690
+ }
+ position++
+ goto l689
+ l690:
+ position, tokenIndex = position690, tokenIndex690
+ }
+ {
+ position691, tokenIndex691 := position, tokenIndex
+ if buffer[position] != rune('b') {
+ goto l692
+ }
+ position++
+ goto l691
+ l692:
+ position, tokenIndex = position691, tokenIndex691
+ if buffer[position] != rune('s') {
+ goto l693
+ }
+ position++
+ goto l691
+ l693:
+ position, tokenIndex = position691, tokenIndex691
+ if buffer[position] != rune('d') {
+ goto l694
+ }
+ position++
+ goto l691
+ l694:
+ position, tokenIndex = position691, tokenIndex691
+ if buffer[position] != rune('h') {
+ goto l695
+ }
+ position++
+ goto l691
+ l695:
+ position, tokenIndex = position691, tokenIndex691
+ if buffer[position] != rune('q') {
+ goto l687
+ }
+ position++
+ }
+ l691:
+ {
+ position696, tokenIndex696 := position, tokenIndex
+ if buffer[position] != rune('[') {
+ goto l696
+ }
+ position++
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l696
+ }
+ position++
+ {
+ position698, tokenIndex698 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l698
+ }
+ position++
+ goto l699
+ l698:
+ position, tokenIndex = position698, tokenIndex698
+ }
+ l699:
+ if buffer[position] != rune(']') {
+ goto l696
+ }
+ position++
+ goto l697
+ l696:
+ position, tokenIndex = position696, tokenIndex696
+ }
+ l697:
+ goto l688
+ l687:
+ position, tokenIndex = position687, tokenIndex687
+ }
+ l688:
+ add(ruleARMVectorRegister, position682)
}
return true
- l679:
- position, tokenIndex = position679, tokenIndex679
+ l681:
+ position, tokenIndex = position681, tokenIndex681
return false
},
/* 43 MemoryRef <- <((SymbolRef BaseIndexScale) / SymbolRef / Low12BitsSymbolRef / (Offset* BaseIndexScale) / (SegmentRegister Offset BaseIndexScale) / (SegmentRegister BaseIndexScale) / (SegmentRegister Offset) / ARMBaseIndexScale / BaseIndexScale)> */
func() bool {
- position698, tokenIndex698 := position, tokenIndex
+ position700, tokenIndex700 := position, tokenIndex
{
- position699 := position
+ position701 := position
{
- position700, tokenIndex700 := position, tokenIndex
+ position702, tokenIndex702 := position, tokenIndex
if !_rules[ruleSymbolRef]() {
- goto l701
- }
- if !_rules[ruleBaseIndexScale]() {
- goto l701
- }
- goto l700
- l701:
- position, tokenIndex = position700, tokenIndex700
- if !_rules[ruleSymbolRef]() {
- goto l702
- }
- goto l700
- l702:
- position, tokenIndex = position700, tokenIndex700
- if !_rules[ruleLow12BitsSymbolRef]() {
goto l703
}
- goto l700
- l703:
- position, tokenIndex = position700, tokenIndex700
- l705:
- {
- position706, tokenIndex706 := position, tokenIndex
- if !_rules[ruleOffset]() {
- goto l706
- }
- goto l705
- l706:
- position, tokenIndex = position706, tokenIndex706
- }
if !_rules[ruleBaseIndexScale]() {
+ goto l703
+ }
+ goto l702
+ l703:
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleSymbolRef]() {
goto l704
}
- goto l700
+ goto l702
l704:
- position, tokenIndex = position700, tokenIndex700
- if !_rules[ruleSegmentRegister]() {
- goto l707
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleLow12BitsSymbolRef]() {
+ goto l705
}
- if !_rules[ruleOffset]() {
- goto l707
- }
- if !_rules[ruleBaseIndexScale]() {
- goto l707
- }
- goto l700
+ goto l702
+ l705:
+ position, tokenIndex = position702, tokenIndex702
l707:
- position, tokenIndex = position700, tokenIndex700
- if !_rules[ruleSegmentRegister]() {
- goto l708
+ {
+ position708, tokenIndex708 := position, tokenIndex
+ if !_rules[ruleOffset]() {
+ goto l708
+ }
+ goto l707
+ l708:
+ position, tokenIndex = position708, tokenIndex708
}
if !_rules[ruleBaseIndexScale]() {
- goto l708
+ goto l706
}
- goto l700
- l708:
- position, tokenIndex = position700, tokenIndex700
+ goto l702
+ l706:
+ position, tokenIndex = position702, tokenIndex702
if !_rules[ruleSegmentRegister]() {
goto l709
}
if !_rules[ruleOffset]() {
goto l709
}
- goto l700
+ if !_rules[ruleBaseIndexScale]() {
+ goto l709
+ }
+ goto l702
l709:
- position, tokenIndex = position700, tokenIndex700
- if !_rules[ruleARMBaseIndexScale]() {
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleSegmentRegister]() {
goto l710
}
- goto l700
- l710:
- position, tokenIndex = position700, tokenIndex700
if !_rules[ruleBaseIndexScale]() {
- goto l698
+ goto l710
+ }
+ goto l702
+ l710:
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleSegmentRegister]() {
+ goto l711
+ }
+ if !_rules[ruleOffset]() {
+ goto l711
+ }
+ goto l702
+ l711:
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleARMBaseIndexScale]() {
+ goto l712
+ }
+ goto l702
+ l712:
+ position, tokenIndex = position702, tokenIndex702
+ if !_rules[ruleBaseIndexScale]() {
+ goto l700
}
}
- l700:
- add(ruleMemoryRef, position699)
+ l702:
+ add(ruleMemoryRef, position701)
}
return true
- l698:
- position, tokenIndex = position698, tokenIndex698
+ l700:
+ position, tokenIndex = position700, tokenIndex700
return false
},
/* 44 SymbolRef <- <((Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?)> */
func() bool {
- position711, tokenIndex711 := position, tokenIndex
+ position713, tokenIndex713 := position, tokenIndex
{
- position712 := position
+ position714 := position
{
- position713, tokenIndex713 := position, tokenIndex
- l715:
+ position715, tokenIndex715 := position, tokenIndex
+ l717:
{
- position716, tokenIndex716 := position, tokenIndex
+ position718, tokenIndex718 := position, tokenIndex
if !_rules[ruleOffset]() {
- goto l716
+ goto l718
}
- goto l715
- l716:
- position, tokenIndex = position716, tokenIndex716
+ goto l717
+ l718:
+ position, tokenIndex = position718, tokenIndex718
}
if buffer[position] != rune('+') {
- goto l713
+ goto l715
}
position++
- goto l714
- l713:
- position, tokenIndex = position713, tokenIndex713
+ goto l716
+ l715:
+ position, tokenIndex = position715, tokenIndex715
}
- l714:
+ l716:
{
- position717, tokenIndex717 := position, tokenIndex
+ position719, tokenIndex719 := position, tokenIndex
if !_rules[ruleLocalSymbol]() {
- goto l718
- }
- goto l717
- l718:
- position, tokenIndex = position717, tokenIndex717
- if !_rules[ruleSymbolName]() {
- goto l711
- }
- }
- l717:
- l719:
- {
- position720, tokenIndex720 := position, tokenIndex
- if !_rules[ruleOffset]() {
goto l720
}
goto l719
l720:
- position, tokenIndex = position720, tokenIndex720
+ position, tokenIndex = position719, tokenIndex719
+ if !_rules[ruleSymbolName]() {
+ goto l713
+ }
+ }
+ l719:
+ l721:
+ {
+ position722, tokenIndex722 := position, tokenIndex
+ if !_rules[ruleOffset]() {
+ goto l722
+ }
+ goto l721
+ l722:
+ position, tokenIndex = position722, tokenIndex722
}
{
- position721, tokenIndex721 := position, tokenIndex
+ position723, tokenIndex723 := position, tokenIndex
if buffer[position] != rune('@') {
- goto l721
+ goto l723
}
position++
if !_rules[ruleSection]() {
- goto l721
- }
- l723:
- {
- position724, tokenIndex724 := position, tokenIndex
- if !_rules[ruleOffset]() {
- goto l724
- }
goto l723
- l724:
- position, tokenIndex = position724, tokenIndex724
}
- goto l722
- l721:
- position, tokenIndex = position721, tokenIndex721
+ l725:
+ {
+ position726, tokenIndex726 := position, tokenIndex
+ if !_rules[ruleOffset]() {
+ goto l726
+ }
+ goto l725
+ l726:
+ position, tokenIndex = position726, tokenIndex726
+ }
+ goto l724
+ l723:
+ position, tokenIndex = position723, tokenIndex723
}
- l722:
- add(ruleSymbolRef, position712)
+ l724:
+ add(ruleSymbolRef, position714)
}
return true
- l711:
- position, tokenIndex = position711, tokenIndex711
+ l713:
+ position, tokenIndex = position713, tokenIndex713
return false
},
/* 45 Low12BitsSymbolRef <- <(':' ('l' / 'L') ('o' / 'O') '1' '2' ':' (LocalSymbol / SymbolName) Offset?)> */
func() bool {
- position725, tokenIndex725 := position, tokenIndex
+ position727, tokenIndex727 := position, tokenIndex
{
- position726 := position
+ position728 := position
if buffer[position] != rune(':') {
- goto l725
+ goto l727
}
position++
{
- position727, tokenIndex727 := position, tokenIndex
- if buffer[position] != rune('l') {
- goto l728
- }
- position++
- goto l727
- l728:
- position, tokenIndex = position727, tokenIndex727
- if buffer[position] != rune('L') {
- goto l725
- }
- position++
- }
- l727:
- {
position729, tokenIndex729 := position, tokenIndex
- if buffer[position] != rune('o') {
+ if buffer[position] != rune('l') {
goto l730
}
position++
goto l729
l730:
position, tokenIndex = position729, tokenIndex729
- if buffer[position] != rune('O') {
- goto l725
+ if buffer[position] != rune('L') {
+ goto l727
}
position++
}
l729:
- if buffer[position] != rune('1') {
- goto l725
- }
- position++
- if buffer[position] != rune('2') {
- goto l725
- }
- position++
- if buffer[position] != rune(':') {
- goto l725
- }
- position++
{
position731, tokenIndex731 := position, tokenIndex
- if !_rules[ruleLocalSymbol]() {
+ if buffer[position] != rune('o') {
goto l732
}
+ position++
goto l731
l732:
position, tokenIndex = position731, tokenIndex731
- if !_rules[ruleSymbolName]() {
- goto l725
+ if buffer[position] != rune('O') {
+ goto l727
}
+ position++
}
l731:
+ if buffer[position] != rune('1') {
+ goto l727
+ }
+ position++
+ if buffer[position] != rune('2') {
+ goto l727
+ }
+ position++
+ if buffer[position] != rune(':') {
+ goto l727
+ }
+ position++
{
position733, tokenIndex733 := position, tokenIndex
- if !_rules[ruleOffset]() {
- goto l733
+ if !_rules[ruleLocalSymbol]() {
+ goto l734
}
- goto l734
- l733:
+ goto l733
+ l734:
position, tokenIndex = position733, tokenIndex733
+ if !_rules[ruleSymbolName]() {
+ goto l727
+ }
}
- l734:
- add(ruleLow12BitsSymbolRef, position726)
+ l733:
+ {
+ position735, tokenIndex735 := position, tokenIndex
+ if !_rules[ruleOffset]() {
+ goto l735
+ }
+ goto l736
+ l735:
+ position, tokenIndex = position735, tokenIndex735
+ }
+ l736:
+ add(ruleLow12BitsSymbolRef, position728)
}
return true
- l725:
- position, tokenIndex = position725, tokenIndex725
+ l727:
+ position, tokenIndex = position727, tokenIndex727
return false
},
/* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
func() bool {
- position735, tokenIndex735 := position, tokenIndex
+ position737, tokenIndex737 := position, tokenIndex
{
- position736 := position
+ position738 := position
if buffer[position] != rune('[') {
- goto l735
+ goto l737
}
position++
if !_rules[ruleARMRegister]() {
- goto l735
+ goto l737
}
{
- position737, tokenIndex737 := position, tokenIndex
+ position739, tokenIndex739 := position, tokenIndex
if buffer[position] != rune(',') {
- goto l737
+ goto l739
}
position++
{
- position739, tokenIndex739 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l739
- }
- goto l740
- l739:
- position, tokenIndex = position739, tokenIndex739
- }
- l740:
- {
position741, tokenIndex741 := position, tokenIndex
+ if !_rules[ruleWS]() {
+ goto l741
+ }
+ goto l742
+ l741:
+ position, tokenIndex = position741, tokenIndex741
+ }
+ l742:
+ {
+ position743, tokenIndex743 := position, tokenIndex
if buffer[position] != rune('#') {
- goto l742
+ goto l744
}
position++
if !_rules[ruleOffset]() {
- goto l742
+ goto l744
}
{
- position743, tokenIndex743 := position, tokenIndex
+ position745, tokenIndex745 := position, tokenIndex
if buffer[position] != rune('*') {
- goto l743
+ goto l745
}
position++
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l743
+ goto l745
}
position++
- l745:
+ l747:
{
- position746, tokenIndex746 := position, tokenIndex
+ position748, tokenIndex748 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l746
+ goto l748
}
position++
- goto l745
- l746:
- position, tokenIndex = position746, tokenIndex746
+ goto l747
+ l748:
+ position, tokenIndex = position748, tokenIndex748
}
- goto l744
- l743:
- position, tokenIndex = position743, tokenIndex743
+ goto l746
+ l745:
+ position, tokenIndex = position745, tokenIndex745
}
+ l746:
+ goto l743
l744:
- goto l741
- l742:
- position, tokenIndex = position741, tokenIndex741
+ position, tokenIndex = position743, tokenIndex743
if !_rules[ruleARMGOTLow12]() {
- goto l747
+ goto l749
}
- goto l741
- l747:
- position, tokenIndex = position741, tokenIndex741
+ goto l743
+ l749:
+ position, tokenIndex = position743, tokenIndex743
if !_rules[ruleLow12BitsSymbolRef]() {
- goto l748
+ goto l750
}
- goto l741
- l748:
- position, tokenIndex = position741, tokenIndex741
+ goto l743
+ l750:
+ position, tokenIndex = position743, tokenIndex743
if !_rules[ruleARMRegister]() {
- goto l737
+ goto l739
}
}
- l741:
+ l743:
{
- position749, tokenIndex749 := position, tokenIndex
+ position751, tokenIndex751 := position, tokenIndex
if buffer[position] != rune(',') {
- goto l749
+ goto l751
}
position++
{
- position751, tokenIndex751 := position, tokenIndex
+ position753, tokenIndex753 := position, tokenIndex
if !_rules[ruleWS]() {
- goto l751
+ goto l753
}
- goto l752
- l751:
- position, tokenIndex = position751, tokenIndex751
+ goto l754
+ l753:
+ position, tokenIndex = position753, tokenIndex753
}
- l752:
+ l754:
if !_rules[ruleARMConstantTweak]() {
- goto l749
+ goto l751
}
- goto l750
- l749:
- position, tokenIndex = position749, tokenIndex749
+ goto l752
+ l751:
+ position, tokenIndex = position751, tokenIndex751
}
- l750:
- goto l738
- l737:
- position, tokenIndex = position737, tokenIndex737
+ l752:
+ goto l740
+ l739:
+ position, tokenIndex = position739, tokenIndex739
}
- l738:
+ l740:
if buffer[position] != rune(']') {
- goto l735
+ goto l737
}
position++
{
- position753, tokenIndex753 := position, tokenIndex
+ position755, tokenIndex755 := position, tokenIndex
if !_rules[ruleARMPostincrement]() {
- goto l753
+ goto l755
}
- goto l754
- l753:
- position, tokenIndex = position753, tokenIndex753
+ goto l756
+ l755:
+ position, tokenIndex = position755, tokenIndex755
}
- l754:
- add(ruleARMBaseIndexScale, position736)
+ l756:
+ add(ruleARMBaseIndexScale, position738)
}
return true
- l735:
- position, tokenIndex = position735, tokenIndex735
+ l737:
+ position, tokenIndex = position737, tokenIndex737
return false
},
/* 47 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */
func() bool {
- position755, tokenIndex755 := position, tokenIndex
+ position757, tokenIndex757 := position, tokenIndex
{
- position756 := position
+ position758 := position
if buffer[position] != rune(':') {
- goto l755
+ goto l757
}
position++
{
- position757, tokenIndex757 := position, tokenIndex
- if buffer[position] != rune('g') {
- goto l758
- }
- position++
- goto l757
- l758:
- position, tokenIndex = position757, tokenIndex757
- if buffer[position] != rune('G') {
- goto l755
- }
- position++
- }
- l757:
- {
position759, tokenIndex759 := position, tokenIndex
- if buffer[position] != rune('o') {
+ if buffer[position] != rune('g') {
goto l760
}
position++
goto l759
l760:
position, tokenIndex = position759, tokenIndex759
- if buffer[position] != rune('O') {
- goto l755
+ if buffer[position] != rune('G') {
+ goto l757
}
position++
}
l759:
{
position761, tokenIndex761 := position, tokenIndex
- if buffer[position] != rune('t') {
+ if buffer[position] != rune('o') {
goto l762
}
position++
goto l761
l762:
position, tokenIndex = position761, tokenIndex761
- if buffer[position] != rune('T') {
- goto l755
+ if buffer[position] != rune('O') {
+ goto l757
}
position++
}
l761:
- if buffer[position] != rune('_') {
- goto l755
- }
- position++
{
position763, tokenIndex763 := position, tokenIndex
- if buffer[position] != rune('l') {
+ if buffer[position] != rune('t') {
goto l764
}
position++
goto l763
l764:
position, tokenIndex = position763, tokenIndex763
- if buffer[position] != rune('L') {
- goto l755
+ if buffer[position] != rune('T') {
+ goto l757
}
position++
}
l763:
+ if buffer[position] != rune('_') {
+ goto l757
+ }
+ position++
{
position765, tokenIndex765 := position, tokenIndex
- if buffer[position] != rune('o') {
+ if buffer[position] != rune('l') {
goto l766
}
position++
goto l765
l766:
position, tokenIndex = position765, tokenIndex765
- if buffer[position] != rune('O') {
- goto l755
+ if buffer[position] != rune('L') {
+ goto l757
}
position++
}
l765:
+ {
+ position767, tokenIndex767 := position, tokenIndex
+ if buffer[position] != rune('o') {
+ goto l768
+ }
+ position++
+ goto l767
+ l768:
+ position, tokenIndex = position767, tokenIndex767
+ if buffer[position] != rune('O') {
+ goto l757
+ }
+ position++
+ }
+ l767:
if buffer[position] != rune('1') {
- goto l755
+ goto l757
}
position++
if buffer[position] != rune('2') {
- goto l755
+ goto l757
}
position++
if buffer[position] != rune(':') {
- goto l755
+ goto l757
}
position++
if !_rules[ruleSymbolName]() {
- goto l755
+ goto l757
}
- add(ruleARMGOTLow12, position756)
+ add(ruleARMGOTLow12, position758)
}
return true
- l755:
- position, tokenIndex = position755, tokenIndex755
+ l757:
+ position, tokenIndex = position757, tokenIndex757
return false
},
/* 48 ARMPostincrement <- <'!'> */
func() bool {
- position767, tokenIndex767 := position, tokenIndex
+ position769, tokenIndex769 := position, tokenIndex
{
- position768 := position
+ position770 := position
if buffer[position] != rune('!') {
- goto l767
+ goto l769
}
position++
- add(ruleARMPostincrement, position768)
+ add(ruleARMPostincrement, position770)
}
return true
- l767:
- position, tokenIndex = position767, tokenIndex767
+ l769:
+ position, tokenIndex = position769, tokenIndex769
return false
},
/* 49 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? ')')> */
func() bool {
- position769, tokenIndex769 := position, tokenIndex
+ position771, tokenIndex771 := position, tokenIndex
{
- position770 := position
+ position772 := position
if buffer[position] != rune('(') {
- goto l769
+ goto l771
}
position++
{
- position771, tokenIndex771 := position, tokenIndex
- if !_rules[ruleRegisterOrConstant]() {
- goto l771
- }
- goto l772
- l771:
- position, tokenIndex = position771, tokenIndex771
- }
- l772:
- {
position773, tokenIndex773 := position, tokenIndex
- if !_rules[ruleWS]() {
+ if !_rules[ruleRegisterOrConstant]() {
goto l773
}
goto l774
@@ -5991,24 +5988,21 @@
l774:
{
position775, tokenIndex775 := position, tokenIndex
- if buffer[position] != rune(',') {
+ if !_rules[ruleWS]() {
goto l775
}
+ goto l776
+ l775:
+ position, tokenIndex = position775, tokenIndex775
+ }
+ l776:
+ {
+ position777, tokenIndex777 := position, tokenIndex
+ if buffer[position] != rune(',') {
+ goto l777
+ }
position++
{
- position777, tokenIndex777 := position, tokenIndex
- if !_rules[ruleWS]() {
- goto l777
- }
- goto l778
- l777:
- position, tokenIndex = position777, tokenIndex777
- }
- l778:
- if !_rules[ruleRegisterOrConstant]() {
- goto l775
- }
- {
position779, tokenIndex779 := position, tokenIndex
if !_rules[ruleWS]() {
goto l779
@@ -6018,94 +6012,96 @@
position, tokenIndex = position779, tokenIndex779
}
l780:
+ if !_rules[ruleRegisterOrConstant]() {
+ goto l777
+ }
{
position781, tokenIndex781 := position, tokenIndex
- if buffer[position] != rune(',') {
+ if !_rules[ruleWS]() {
goto l781
}
- position++
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l781
- }
- position++
- l783:
- {
- position784, tokenIndex784 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l784
- }
- position++
- goto l783
- l784:
- position, tokenIndex = position784, tokenIndex784
- }
goto l782
l781:
position, tokenIndex = position781, tokenIndex781
}
l782:
- goto l776
- l775:
- position, tokenIndex = position775, tokenIndex775
+ {
+ position783, tokenIndex783 := position, tokenIndex
+ if buffer[position] != rune(',') {
+ goto l783
+ }
+ position++
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l783
+ }
+ position++
+ l785:
+ {
+ position786, tokenIndex786 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l786
+ }
+ position++
+ goto l785
+ l786:
+ position, tokenIndex = position786, tokenIndex786
+ }
+ goto l784
+ l783:
+ position, tokenIndex = position783, tokenIndex783
+ }
+ l784:
+ goto l778
+ l777:
+ position, tokenIndex = position777, tokenIndex777
}
- l776:
+ l778:
if buffer[position] != rune(')') {
- goto l769
+ goto l771
}
position++
- add(ruleBaseIndexScale, position770)
+ add(ruleBaseIndexScale, position772)
}
return true
- l769:
- position, tokenIndex = position769, tokenIndex769
+ l771:
+ position, tokenIndex = position771, tokenIndex771
return false
},
/* 50 Operator <- <('+' / '-')> */
func() bool {
- position785, tokenIndex785 := position, tokenIndex
+ position787, tokenIndex787 := position, tokenIndex
{
- position786 := position
+ position788 := position
{
- position787, tokenIndex787 := position, tokenIndex
+ position789, tokenIndex789 := position, tokenIndex
if buffer[position] != rune('+') {
- goto l788
+ goto l790
}
position++
- goto l787
- l788:
- position, tokenIndex = position787, tokenIndex787
+ goto l789
+ l790:
+ position, tokenIndex = position789, tokenIndex789
if buffer[position] != rune('-') {
- goto l785
+ goto l787
}
position++
}
- l787:
- add(ruleOperator, position786)
+ l789:
+ add(ruleOperator, position788)
}
return true
- l785:
- position, tokenIndex = position785, tokenIndex785
+ l787:
+ position, tokenIndex = position787, tokenIndex787
return false
},
/* 51 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / [0-9]+))> */
func() bool {
- position789, tokenIndex789 := position, tokenIndex
+ position791, tokenIndex791 := position, tokenIndex
{
- position790 := position
- {
- position791, tokenIndex791 := position, tokenIndex
- if buffer[position] != rune('+') {
- goto l791
- }
- position++
- goto l792
- l791:
- position, tokenIndex = position791, tokenIndex791
- }
- l792:
+ position792 := position
{
position793, tokenIndex793 := position, tokenIndex
- if buffer[position] != rune('-') {
+ if buffer[position] != rune('+') {
goto l793
}
position++
@@ -6116,284 +6112,295 @@
l794:
{
position795, tokenIndex795 := position, tokenIndex
+ if buffer[position] != rune('-') {
+ goto l795
+ }
+ position++
+ goto l796
+ l795:
+ position, tokenIndex = position795, tokenIndex795
+ }
+ l796:
+ {
+ position797, tokenIndex797 := position, tokenIndex
if buffer[position] != rune('0') {
- goto l796
+ goto l798
}
position++
{
- position797, tokenIndex797 := position, tokenIndex
+ position799, tokenIndex799 := position, tokenIndex
if buffer[position] != rune('b') {
+ goto l800
+ }
+ position++
+ goto l799
+ l800:
+ position, tokenIndex = position799, tokenIndex799
+ if buffer[position] != rune('B') {
goto l798
}
position++
- goto l797
- l798:
- position, tokenIndex = position797, tokenIndex797
- if buffer[position] != rune('B') {
- goto l796
- }
- position++
}
- l797:
- {
- position801, tokenIndex801 := position, tokenIndex
- if buffer[position] != rune('0') {
- goto l802
- }
- position++
- goto l801
- l802:
- position, tokenIndex = position801, tokenIndex801
- if buffer[position] != rune('1') {
- goto l796
- }
- position++
- }
- l801:
l799:
{
- position800, tokenIndex800 := position, tokenIndex
+ position803, tokenIndex803 := position, tokenIndex
+ if buffer[position] != rune('0') {
+ goto l804
+ }
+ position++
+ goto l803
+ l804:
+ position, tokenIndex = position803, tokenIndex803
+ if buffer[position] != rune('1') {
+ goto l798
+ }
+ position++
+ }
+ l803:
+ l801:
+ {
+ position802, tokenIndex802 := position, tokenIndex
{
- position803, tokenIndex803 := position, tokenIndex
+ position805, tokenIndex805 := position, tokenIndex
if buffer[position] != rune('0') {
- goto l804
+ goto l806
}
position++
- goto l803
- l804:
- position, tokenIndex = position803, tokenIndex803
+ goto l805
+ l806:
+ position, tokenIndex = position805, tokenIndex805
if buffer[position] != rune('1') {
- goto l800
+ goto l802
}
position++
}
- l803:
- goto l799
- l800:
- position, tokenIndex = position800, tokenIndex800
+ l805:
+ goto l801
+ l802:
+ position, tokenIndex = position802, tokenIndex802
}
- goto l795
- l796:
- position, tokenIndex = position795, tokenIndex795
+ goto l797
+ l798:
+ position, tokenIndex = position797, tokenIndex797
if buffer[position] != rune('0') {
- goto l805
+ goto l807
}
position++
{
- position806, tokenIndex806 := position, tokenIndex
+ position808, tokenIndex808 := position, tokenIndex
if buffer[position] != rune('x') {
+ goto l809
+ }
+ position++
+ goto l808
+ l809:
+ position, tokenIndex = position808, tokenIndex808
+ if buffer[position] != rune('X') {
goto l807
}
position++
- goto l806
- l807:
- position, tokenIndex = position806, tokenIndex806
- if buffer[position] != rune('X') {
- goto l805
- }
- position++
}
- l806:
- {
- position810, tokenIndex810 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l811
- }
- position++
- goto l810
- l811:
- position, tokenIndex = position810, tokenIndex810
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l812
- }
- position++
- goto l810
- l812:
- position, tokenIndex = position810, tokenIndex810
- {
- position813, tokenIndex813 := position, tokenIndex
- if c := buffer[position]; c < rune('a') || c > rune('f') {
- goto l814
- }
- position++
- goto l813
- l814:
- position, tokenIndex = position813, tokenIndex813
- if c := buffer[position]; c < rune('A') || c > rune('F') {
- goto l805
- }
- position++
- }
- l813:
- }
- l810:
l808:
{
- position809, tokenIndex809 := position, tokenIndex
+ position812, tokenIndex812 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l813
+ }
+ position++
+ goto l812
+ l813:
+ position, tokenIndex = position812, tokenIndex812
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l814
+ }
+ position++
+ goto l812
+ l814:
+ position, tokenIndex = position812, tokenIndex812
{
position815, tokenIndex815 := position, tokenIndex
- if c := buffer[position]; c < rune('0') || c > rune('9') {
+ if c := buffer[position]; c < rune('a') || c > rune('f') {
goto l816
}
position++
goto l815
l816:
position, tokenIndex = position815, tokenIndex815
- if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l817
+ if c := buffer[position]; c < rune('A') || c > rune('F') {
+ goto l807
}
position++
- goto l815
- l817:
- position, tokenIndex = position815, tokenIndex815
+ }
+ l815:
+ }
+ l812:
+ l810:
+ {
+ position811, tokenIndex811 := position, tokenIndex
+ {
+ position817, tokenIndex817 := position, tokenIndex
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l818
+ }
+ position++
+ goto l817
+ l818:
+ position, tokenIndex = position817, tokenIndex817
+ if c := buffer[position]; c < rune('0') || c > rune('9') {
+ goto l819
+ }
+ position++
+ goto l817
+ l819:
+ position, tokenIndex = position817, tokenIndex817
{
- position818, tokenIndex818 := position, tokenIndex
+ position820, tokenIndex820 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('f') {
- goto l819
+ goto l821
}
position++
- goto l818
- l819:
- position, tokenIndex = position818, tokenIndex818
+ goto l820
+ l821:
+ position, tokenIndex = position820, tokenIndex820
if c := buffer[position]; c < rune('A') || c > rune('F') {
- goto l809
+ goto l811
}
position++
}
- l818:
+ l820:
}
- l815:
- goto l808
- l809:
- position, tokenIndex = position809, tokenIndex809
+ l817:
+ goto l810
+ l811:
+ position, tokenIndex = position811, tokenIndex811
}
- goto l795
- l805:
- position, tokenIndex = position795, tokenIndex795
+ goto l797
+ l807:
+ position, tokenIndex = position797, tokenIndex797
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l789
+ goto l791
}
position++
- l820:
+ l822:
{
- position821, tokenIndex821 := position, tokenIndex
+ position823, tokenIndex823 := position, tokenIndex
if c := buffer[position]; c < rune('0') || c > rune('9') {
- goto l821
+ goto l823
}
position++
- goto l820
- l821:
- position, tokenIndex = position821, tokenIndex821
+ goto l822
+ l823:
+ position, tokenIndex = position823, tokenIndex823
}
}
- l795:
- add(ruleOffset, position790)
+ l797:
+ add(ruleOffset, position792)
}
return true
- l789:
- position, tokenIndex = position789, tokenIndex789
+ l791:
+ position, tokenIndex = position791, tokenIndex791
return false
},
/* 52 Section <- <([a-z] / [A-Z] / '@')+> */
func() bool {
- position822, tokenIndex822 := position, tokenIndex
+ position824, tokenIndex824 := position, tokenIndex
{
- position823 := position
+ position825 := position
{
- position826, tokenIndex826 := position, tokenIndex
+ position828, tokenIndex828 := position, tokenIndex
if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l827
+ goto l829
}
position++
+ goto l828
+ l829:
+ position, tokenIndex = position828, tokenIndex828
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l830
+ }
+ position++
+ goto l828
+ l830:
+ position, tokenIndex = position828, tokenIndex828
+ if buffer[position] != rune('@') {
+ goto l824
+ }
+ position++
+ }
+ l828:
+ l826:
+ {
+ position827, tokenIndex827 := position, tokenIndex
+ {
+ position831, tokenIndex831 := position, tokenIndex
+ if c := buffer[position]; c < rune('a') || c > rune('z') {
+ goto l832
+ }
+ position++
+ goto l831
+ l832:
+ position, tokenIndex = position831, tokenIndex831
+ if c := buffer[position]; c < rune('A') || c > rune('Z') {
+ goto l833
+ }
+ position++
+ goto l831
+ l833:
+ position, tokenIndex = position831, tokenIndex831
+ if buffer[position] != rune('@') {
+ goto l827
+ }
+ position++
+ }
+ l831:
goto l826
l827:
- position, tokenIndex = position826, tokenIndex826
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l828
- }
- position++
- goto l826
- l828:
- position, tokenIndex = position826, tokenIndex826
- if buffer[position] != rune('@') {
- goto l822
- }
- position++
+ position, tokenIndex = position827, tokenIndex827
}
- l826:
- l824:
- {
- position825, tokenIndex825 := position, tokenIndex
- {
- position829, tokenIndex829 := position, tokenIndex
- if c := buffer[position]; c < rune('a') || c > rune('z') {
- goto l830
- }
- position++
- goto l829
- l830:
- position, tokenIndex = position829, tokenIndex829
- if c := buffer[position]; c < rune('A') || c > rune('Z') {
- goto l831
- }
- position++
- goto l829
- l831:
- position, tokenIndex = position829, tokenIndex829
- if buffer[position] != rune('@') {
- goto l825
- }
- position++
- }
- l829:
- goto l824
- l825:
- position, tokenIndex = position825, tokenIndex825
- }
- add(ruleSection, position823)
+ add(ruleSection, position825)
}
return true
- l822:
- position, tokenIndex = position822, tokenIndex822
+ l824:
+ position, tokenIndex = position824, tokenIndex824
return false
},
/* 53 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */
func() bool {
- position832, tokenIndex832 := position, tokenIndex
+ position834, tokenIndex834 := position, tokenIndex
{
- position833 := position
+ position835 := position
if buffer[position] != rune('%') {
- goto l832
+ goto l834
}
position++
{
- position834, tokenIndex834 := position, tokenIndex
+ position836, tokenIndex836 := position, tokenIndex
if c := buffer[position]; c < rune('c') || c > rune('g') {
- goto l835
+ goto l837
}
position++
- goto l834
- l835:
- position, tokenIndex = position834, tokenIndex834
+ goto l836
+ l837:
+ position, tokenIndex = position836, tokenIndex836
if buffer[position] != rune('s') {
- goto l832
+ goto l834
}
position++
}
- l834:
+ l836:
if buffer[position] != rune('s') {
- goto l832
+ goto l834
}
position++
if buffer[position] != rune(':') {
- goto l832
+ goto l834
}
position++
- add(ruleSegmentRegister, position833)
+ add(ruleSegmentRegister, position835)
}
return true
- l832:
- position, tokenIndex = position832, tokenIndex832
+ l834:
+ position, tokenIndex = position834, tokenIndex834
return false
},
}
diff --git a/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s b/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
index c54756b..7e48e27 100644
--- a/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
+++ b/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
@@ -47,3 +47,4 @@
.L4: .L5: movq %rbx, %rax # This is also legal.
.size foo, .-foo
.type foo, @function
+.uleb128 .foo-1-.bar
diff --git a/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s b/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
index 23e97c8..a55e852 100644
--- a/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
+++ b/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
@@ -55,6 +55,7 @@
movq %rbx, %rax # This is also legal.
.size foo, .-foo
.type foo, @function
+.uleb128 .foo-1-.bar
.text
.loc 2 2 0
BORINGSSL_bcm_text_end:
diff --git a/src/util/fipstools/test-break-kat.sh b/src/util/fipstools/test-break-kat.sh
new file mode 100644
index 0000000..d2c44a7
--- /dev/null
+++ b/src/util/fipstools/test-break-kat.sh
@@ -0,0 +1,40 @@
+# Copyright (c) 2022, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# This script attempts to break each of the known KATs and checks that doing so
+# seems to work and at least mentions the correct KAT in the output.
+
+set -x
+set -e
+
+TEST_FIPS_BIN="build/util/fipstools/test_fips"
+
+if [ ! -f $TEST_FIPS_BIN ]; then
+ echo "$TEST_FIPS_BIN is missing. Run this script from the top level of a"
+ echo "BoringSSL checkout and ensure that ./build-fips-break-test-binaries.sh"
+ echo "has been run first."
+ exit 1
+fi
+
+KATS=$(go run util/fipstools/break-kat.go --list-tests)
+
+for kat in $KATS; do
+ go run util/fipstools/break-kat.go $TEST_FIPS_BIN $kat > break-kat-bin
+ chmod u+x ./break-kat-bin
+ if ! (./break-kat-bin 2>&1 || true) | egrep -q "^$kat[^a-zA-Z0-9]"; then
+ echo "Failure for $kat did not mention that name in the output"
+ exit 1
+ fi
+ rm ./break-kat-bin
+done
diff --git a/src/util/fipstools/cavp/test_fips.c b/src/util/fipstools/test_fips.c
similarity index 98%
rename from src/util/fipstools/cavp/test_fips.c
rename to src/util/fipstools/test_fips.c
index dd82d65..b3d5521 100644
--- a/src/util/fipstools/cavp/test_fips.c
+++ b/src/util/fipstools/test_fips.c
@@ -30,9 +30,9 @@
#include <openssl/rsa.h>
#include <openssl/sha.h>
-#include "../crypto/fipsmodule/rand/internal.h"
-#include "../crypto/fipsmodule/tls/internal.h"
-#include "../crypto/internal.h"
+#include "../../crypto/fipsmodule/rand/internal.h"
+#include "../../crypto/fipsmodule/tls/internal.h"
+#include "../../crypto/internal.h"
static void hexdump(const void *a, size_t len) {
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 5cc2de4..3263d9b 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -26,15 +26,15 @@
# OS_ARCH_COMBOS maps from OS and platform to the OpenSSL assembly "style" for
# that platform and the extension used by asm files.
OS_ARCH_COMBOS = [
- ('ios', 'arm', 'ios32', [], 'S'),
- ('ios', 'aarch64', 'ios64', [], 'S'),
+ ('apple', 'arm', 'ios32', [], 'S'),
+ ('apple', 'aarch64', 'ios64', [], 'S'),
+ ('apple', 'x86', 'macosx', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
+ ('apple', 'x86_64', 'macosx', [], 'S'),
('linux', 'arm', 'linux32', [], 'S'),
('linux', 'aarch64', 'linux64', [], 'S'),
('linux', 'ppc64le', 'linux64le', [], 'S'),
('linux', 'x86', 'elf', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
('linux', 'x86_64', 'elf', [], 'S'),
- ('mac', 'x86', 'macosx', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
- ('mac', 'x86_64', 'macosx', [], 'S'),
('win', 'x86', 'win32n', ['-DOPENSSL_IA32_SSE2'], 'asm'),
('win', 'x86_64', 'nasm', [], 'asm'),
('win', 'aarch64', 'win64', [], 'S'),
@@ -586,12 +586,8 @@
asm_files)
cmake.write(
-R'''if(APPLE AND ARCH STREQUAL "aarch64")
- set(CRYPTO_ARCH_SOURCES ${CRYPTO_ios_aarch64_SOURCES})
-elseif(APPLE AND ARCH STREQUAL "arm")
- set(CRYPTO_ARCH_SOURCES ${CRYPTO_ios_arm_SOURCES})
-elseif(APPLE)
- set(CRYPTO_ARCH_SOURCES ${CRYPTO_mac_${ARCH}_SOURCES})
+R'''if(APPLE)
+ set(CRYPTO_ARCH_SOURCES ${CRYPTO_apple_${ARCH}_SOURCES})
elseif(UNIX)
set(CRYPTO_ARCH_SOURCES ${CRYPTO_linux_${ARCH}_SOURCES})
elseif(WIN32)
diff --git a/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm b/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
index 68c74cc..49be6f6 100644
--- a/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
+++ b/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
@@ -31,6 +31,8 @@
mov r9d,DWORD[r11]
mov r10d,DWORD[4+r11]
mov r11d,DWORD[8+r11]
+ test r11d,536870912
+ jnz NEAR $L$shaext_shortcut
and r9d,1073741824
and r10d,268435968
or r10d,r9d
@@ -1795,6 +1797,240 @@
DB 111,114,103,62,0
ALIGN 64
+sha256_block_data_order_shaext:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_shaext:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+
+$L$shaext_shortcut:
+ lea rsp,[((-88))+rsp]
+ movaps XMMWORD[(-8-80)+rax],xmm6
+ movaps XMMWORD[(-8-64)+rax],xmm7
+ movaps XMMWORD[(-8-48)+rax],xmm8
+ movaps XMMWORD[(-8-32)+rax],xmm9
+ movaps XMMWORD[(-8-16)+rax],xmm10
+$L$prologue_shaext:
+ lea rcx,[((K256+128))]
+ movdqu xmm1,XMMWORD[rdi]
+ movdqu xmm2,XMMWORD[16+rdi]
+ movdqa xmm7,XMMWORD[((512-128))+rcx]
+
+ pshufd xmm0,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ pshufd xmm2,xmm2,0x1b
+ movdqa xmm8,xmm7
+DB 102,15,58,15,202,8
+ punpcklqdq xmm2,xmm0
+ jmp NEAR $L$oop_shaext
+
+ALIGN 16
+$L$oop_shaext:
+ movdqu xmm3,XMMWORD[rsi]
+ movdqu xmm4,XMMWORD[16+rsi]
+ movdqu xmm5,XMMWORD[32+rsi]
+DB 102,15,56,0,223
+ movdqu xmm6,XMMWORD[48+rsi]
+
+ movdqa xmm0,XMMWORD[((0-128))+rcx]
+ paddd xmm0,xmm3
+DB 102,15,56,0,231
+ movdqa xmm10,xmm2
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ nop
+ movdqa xmm9,xmm1
+DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((32-128))+rcx]
+ paddd xmm0,xmm4
+DB 102,15,56,0,239
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ lea rsi,[64+rsi]
+DB 15,56,204,220
+DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((64-128))+rcx]
+ paddd xmm0,xmm5
+DB 102,15,56,0,247
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229
+DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((96-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((128-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((160-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+ nop
+ paddd xmm6,xmm7
+DB 15,56,204,220
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((192-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,205,245
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((224-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((256-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((288-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+ nop
+ paddd xmm6,xmm7
+DB 15,56,204,220
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((320-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,205,245
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm6
+DB 102,15,58,15,253,4
+ nop
+ paddd xmm3,xmm7
+DB 15,56,204,229
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((352-128))+rcx]
+ paddd xmm0,xmm6
+DB 15,56,205,222
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm3
+DB 102,15,58,15,254,4
+ nop
+ paddd xmm4,xmm7
+DB 15,56,204,238
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((384-128))+rcx]
+ paddd xmm0,xmm3
+DB 15,56,205,227
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm4
+DB 102,15,58,15,251,4
+ nop
+ paddd xmm5,xmm7
+DB 15,56,204,243
+DB 15,56,203,202
+ movdqa xmm0,XMMWORD[((416-128))+rcx]
+ paddd xmm0,xmm4
+DB 15,56,205,236
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ movdqa xmm7,xmm5
+DB 102,15,58,15,252,4
+DB 15,56,203,202
+ paddd xmm6,xmm7
+
+ movdqa xmm0,XMMWORD[((448-128))+rcx]
+ paddd xmm0,xmm5
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+DB 15,56,205,245
+ movdqa xmm7,xmm8
+DB 15,56,203,202
+
+ movdqa xmm0,XMMWORD[((480-128))+rcx]
+ paddd xmm0,xmm6
+ nop
+DB 15,56,203,209
+ pshufd xmm0,xmm0,0x0e
+ dec rdx
+ nop
+DB 15,56,203,202
+
+ paddd xmm2,xmm10
+ paddd xmm1,xmm9
+ jnz NEAR $L$oop_shaext
+
+ pshufd xmm2,xmm2,0xb1
+ pshufd xmm7,xmm1,0x1b
+ pshufd xmm1,xmm1,0xb1
+ punpckhqdq xmm1,xmm2
+DB 102,15,58,15,215,8
+
+ movdqu XMMWORD[rdi],xmm1
+ movdqu XMMWORD[16+rdi],xmm2
+ movaps xmm6,XMMWORD[((-8-80))+rax]
+ movaps xmm7,XMMWORD[((-8-64))+rax]
+ movaps xmm8,XMMWORD[((-8-48))+rax]
+ movaps xmm9,XMMWORD[((-8-32))+rax]
+ movaps xmm10,XMMWORD[((-8-16))+rax]
+ mov rsp,rax
+$L$epilogue_shaext:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+
+$L$SEH_end_sha256_block_data_order_shaext:
+
+ALIGN 64
sha256_block_data_order_ssse3:
mov QWORD[8+rsp],rdi ;WIN64 prologue
mov QWORD[16+rsp],rsi
@@ -4115,11 +4351,46 @@
pop rsi
DB 0F3h,0C3h ;repret
+
+ALIGN 16
+shaext_handler:
+ push rsi
+ push rdi
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ pushfq
+ sub rsp,64
+
+ mov rax,QWORD[120+r8]
+ mov rbx,QWORD[248+r8]
+
+ lea r10,[$L$prologue_shaext]
+ cmp rbx,r10
+ jb NEAR $L$in_prologue
+
+ lea r10,[$L$epilogue_shaext]
+ cmp rbx,r10
+ jae NEAR $L$in_prologue
+
+ lea rsi,[((-8-80))+rax]
+ lea rdi,[512+r8]
+ mov ecx,10
+ DD 0xa548f3fc
+
+ jmp NEAR $L$in_prologue
+
section .pdata rdata align=4
ALIGN 4
DD $L$SEH_begin_sha256_block_data_order wrt ..imagebase
DD $L$SEH_end_sha256_block_data_order wrt ..imagebase
DD $L$SEH_info_sha256_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha256_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_end_sha256_block_data_order_shaext wrt ..imagebase
+ DD $L$SEH_info_sha256_block_data_order_shaext wrt ..imagebase
DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
@@ -4132,6 +4403,9 @@
DB 9,0,0,0
DD se_handler wrt ..imagebase
DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_shaext:
+DB 9,0,0,0
+ DD shaext_handler wrt ..imagebase
$L$SEH_info_sha256_block_data_order_ssse3:
DB 9,0,0,0
DD se_handler wrt ..imagebase