Snap for 8253222 from 1f65662f464ec9b476981a5067dc949612e9adde to sdk-release

Change-Id: I380b08c675ac9177cb0b2ec5b554a75be6d90021
diff --git a/Android.bp b/Android.bp
index 416bfd8..ea7a7bd 100644
--- a/Android.bp
+++ b/Android.bp
@@ -60,17 +60,11 @@
         "-Werror",
     ],
 
-    conlyflags: ["-std=c99"],
+    c_std: "gnu11",
 
     // Build BoringSSL and its tests against the same STL.
     sdk_version: "9",
     target: {
-        linux: {
-            cflags: ["-D_XOPEN_SOURCE=700"],
-        },
-        windows: {
-            cflags: ["-D_XOPEN_SOURCE=700"],
-        },
         android: {
             stl: "libc++_static",
         },
@@ -421,53 +415,6 @@
     },
 }
 
-// Used for CAVP testing for FIPS certification.
-// Not installed on devices by default.
-cc_binary {
-    name: "cavp",
-    host_supported: true,
-    srcs: [
-        "src/util/fipstools/cavp/cavp_aes_gcm_test.cc",
-        "src/util/fipstools/cavp/cavp_aes_test.cc",
-        "src/util/fipstools/cavp/cavp_ctr_drbg_test.cc",
-        "src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc",
-        "src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc",
-        "src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc",
-        "src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc",
-        "src/util/fipstools/cavp/cavp_hmac_test.cc",
-        "src/util/fipstools/cavp/cavp_kas_test.cc",
-        "src/util/fipstools/cavp/cavp_keywrap_test.cc",
-        "src/util/fipstools/cavp/cavp_main.cc",
-        "src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc",
-        "src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc",
-        "src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc",
-        "src/util/fipstools/cavp/cavp_sha_monte_test.cc",
-        "src/util/fipstools/cavp/cavp_sha_test.cc",
-        "src/util/fipstools/cavp/cavp_tdes_test.cc",
-        "src/util/fipstools/cavp/cavp_test_util.cc",
-        "src/util/fipstools/cavp/cavp_tlskdf_test.cc",
-    ],
-    target: {
-        android: {
-            compile_multilib: "both",
-        },
-    },
-    multilib: {
-        lib32: {
-            suffix: "32",
-        },
-    },
-
-    shared_libs: [
-        "libcrypto",
-    ],
-
-    defaults: [
-        "boringssl_test_support_sources",
-        "boringssl_flags",
-    ],
-}
-
 // Used for ACVP testing for FIPS certification.
 // Not installed on devices by default.
 cc_binary {
@@ -584,6 +531,6 @@
         "libcrypto",
     ],
     srcs: [
-        "src/util/fipstools/cavp/test_fips.c",
+        "src/util/fipstools/test_fips.c",
     ],
 }
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index 9fc401a..95a1efc 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-345c86b1cfcc478a71a9a71f0206893fd16ae912
+81502beeddc5f116d44d0898c6c4a33057198db8
diff --git a/BUILD.generated.bzl b/BUILD.generated.bzl
index 8621d9b..bf9efa7 100644
--- a/BUILD.generated.bzl
+++ b/BUILD.generated.bzl
@@ -37,8 +37,6 @@
     "src/crypto/fipsmodule/cipher/aead.c",
     "src/crypto/fipsmodule/cipher/cipher.c",
     "src/crypto/fipsmodule/cipher/e_aes.c",
-    "src/crypto/fipsmodule/cipher/e_des.c",
-    "src/crypto/fipsmodule/des/des.c",
     "src/crypto/fipsmodule/dh/check.c",
     "src/crypto/fipsmodule/dh/dh.c",
     "src/crypto/fipsmodule/digest/digest.c",
@@ -218,6 +216,7 @@
     "src/crypto/cpu_arm_linux.h",
     "src/crypto/curve25519/curve25519_tables.h",
     "src/crypto/curve25519/internal.h",
+    "src/crypto/des/internal.h",
     "src/crypto/dsa/internal.h",
     "src/crypto/ec_extra/internal.h",
     "src/crypto/err/internal.h",
@@ -227,7 +226,7 @@
     "src/crypto/fipsmodule/bn/rsaz_exp.h",
     "src/crypto/fipsmodule/cipher/internal.h",
     "src/crypto/fipsmodule/delocate.h",
-    "src/crypto/fipsmodule/des/internal.h",
+    "src/crypto/fipsmodule/dh/internal.h",
     "src/crypto/fipsmodule/digest/internal.h",
     "src/crypto/fipsmodule/digest/md32_common.h",
     "src/crypto/fipsmodule/ec/internal.h",
@@ -320,6 +319,7 @@
     "src/crypto/cipher_extra/e_aesctrhmac.c",
     "src/crypto/cipher_extra/e_aesgcmsiv.c",
     "src/crypto/cipher_extra/e_chacha20poly1305.c",
+    "src/crypto/cipher_extra/e_des.c",
     "src/crypto/cipher_extra/e_null.c",
     "src/crypto/cipher_extra/e_rc2.c",
     "src/crypto/cipher_extra/e_rc4.c",
@@ -338,6 +338,7 @@
     "src/crypto/crypto.c",
     "src/crypto/curve25519/curve25519.c",
     "src/crypto/curve25519/spake25519.c",
+    "src/crypto/des/des.c",
     "src/crypto/dh_extra/dh_asn1.c",
     "src/crypto/dh_extra/params.c",
     "src/crypto/digest_extra/digest_extra.c",
@@ -520,31 +521,69 @@
     "src/tool/transport_common.h",
 ]
 
-crypto_sources_ios_aarch64 = [
-    "ios-aarch64/crypto/chacha/chacha-armv8.S",
-    "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S",
-    "ios-aarch64/crypto/fipsmodule/armv8-mont.S",
-    "ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
-    "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
-    "ios-aarch64/crypto/fipsmodule/sha1-armv8.S",
-    "ios-aarch64/crypto/fipsmodule/sha256-armv8.S",
-    "ios-aarch64/crypto/fipsmodule/sha512-armv8.S",
-    "ios-aarch64/crypto/fipsmodule/vpaes-armv8.S",
-    "ios-aarch64/crypto/test/trampoline-armv8.S",
+crypto_sources_apple_aarch64 = [
+    "apple-aarch64/crypto/chacha/chacha-armv8.S",
+    "apple-aarch64/crypto/fipsmodule/aesv8-armx64.S",
+    "apple-aarch64/crypto/fipsmodule/armv8-mont.S",
+    "apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S",
+    "apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S",
+    "apple-aarch64/crypto/fipsmodule/sha1-armv8.S",
+    "apple-aarch64/crypto/fipsmodule/sha256-armv8.S",
+    "apple-aarch64/crypto/fipsmodule/sha512-armv8.S",
+    "apple-aarch64/crypto/fipsmodule/vpaes-armv8.S",
+    "apple-aarch64/crypto/test/trampoline-armv8.S",
 ]
 
-crypto_sources_ios_arm = [
-    "ios-arm/crypto/chacha/chacha-armv4.S",
-    "ios-arm/crypto/fipsmodule/aesv8-armx32.S",
-    "ios-arm/crypto/fipsmodule/armv4-mont.S",
-    "ios-arm/crypto/fipsmodule/bsaes-armv7.S",
-    "ios-arm/crypto/fipsmodule/ghash-armv4.S",
-    "ios-arm/crypto/fipsmodule/ghashv8-armx32.S",
-    "ios-arm/crypto/fipsmodule/sha1-armv4-large.S",
-    "ios-arm/crypto/fipsmodule/sha256-armv4.S",
-    "ios-arm/crypto/fipsmodule/sha512-armv4.S",
-    "ios-arm/crypto/fipsmodule/vpaes-armv7.S",
-    "ios-arm/crypto/test/trampoline-armv4.S",
+crypto_sources_apple_arm = [
+    "apple-arm/crypto/chacha/chacha-armv4.S",
+    "apple-arm/crypto/fipsmodule/aesv8-armx32.S",
+    "apple-arm/crypto/fipsmodule/armv4-mont.S",
+    "apple-arm/crypto/fipsmodule/bsaes-armv7.S",
+    "apple-arm/crypto/fipsmodule/ghash-armv4.S",
+    "apple-arm/crypto/fipsmodule/ghashv8-armx32.S",
+    "apple-arm/crypto/fipsmodule/sha1-armv4-large.S",
+    "apple-arm/crypto/fipsmodule/sha256-armv4.S",
+    "apple-arm/crypto/fipsmodule/sha512-armv4.S",
+    "apple-arm/crypto/fipsmodule/vpaes-armv7.S",
+    "apple-arm/crypto/test/trampoline-armv4.S",
+]
+
+crypto_sources_apple_x86 = [
+    "apple-x86/crypto/chacha/chacha-x86.S",
+    "apple-x86/crypto/fipsmodule/aesni-x86.S",
+    "apple-x86/crypto/fipsmodule/bn-586.S",
+    "apple-x86/crypto/fipsmodule/co-586.S",
+    "apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
+    "apple-x86/crypto/fipsmodule/ghash-x86.S",
+    "apple-x86/crypto/fipsmodule/md5-586.S",
+    "apple-x86/crypto/fipsmodule/sha1-586.S",
+    "apple-x86/crypto/fipsmodule/sha256-586.S",
+    "apple-x86/crypto/fipsmodule/sha512-586.S",
+    "apple-x86/crypto/fipsmodule/vpaes-x86.S",
+    "apple-x86/crypto/fipsmodule/x86-mont.S",
+    "apple-x86/crypto/test/trampoline-x86.S",
+]
+
+crypto_sources_apple_x86_64 = [
+    "apple-x86_64/crypto/chacha/chacha-x86_64.S",
+    "apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
+    "apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/aesni-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/ghash-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/md5-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
+    "apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
+    "apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/rsaz-avx2.S",
+    "apple-x86_64/crypto/fipsmodule/sha1-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/sha256-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/sha512-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
+    "apple-x86_64/crypto/fipsmodule/x86_64-mont.S",
+    "apple-x86_64/crypto/fipsmodule/x86_64-mont5.S",
+    "apple-x86_64/crypto/test/trampoline-x86_64.S",
 ]
 
 crypto_sources_linux_aarch64 = [
@@ -621,44 +660,6 @@
     "src/crypto/hrss/asm/poly_rq_mul.S",
 ]
 
-crypto_sources_mac_x86 = [
-    "mac-x86/crypto/chacha/chacha-x86.S",
-    "mac-x86/crypto/fipsmodule/aesni-x86.S",
-    "mac-x86/crypto/fipsmodule/bn-586.S",
-    "mac-x86/crypto/fipsmodule/co-586.S",
-    "mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S",
-    "mac-x86/crypto/fipsmodule/ghash-x86.S",
-    "mac-x86/crypto/fipsmodule/md5-586.S",
-    "mac-x86/crypto/fipsmodule/sha1-586.S",
-    "mac-x86/crypto/fipsmodule/sha256-586.S",
-    "mac-x86/crypto/fipsmodule/sha512-586.S",
-    "mac-x86/crypto/fipsmodule/vpaes-x86.S",
-    "mac-x86/crypto/fipsmodule/x86-mont.S",
-    "mac-x86/crypto/test/trampoline-x86.S",
-]
-
-crypto_sources_mac_x86_64 = [
-    "mac-x86_64/crypto/chacha/chacha-x86_64.S",
-    "mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S",
-    "mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/aesni-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/md5-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S",
-    "mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S",
-    "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S",
-    "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/sha256-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/sha512-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S",
-    "mac-x86_64/crypto/fipsmodule/x86_64-mont.S",
-    "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S",
-    "mac-x86_64/crypto/test/trampoline-x86_64.S",
-]
-
 crypto_sources_win_aarch64 = [
     "win-aarch64/crypto/chacha/chacha-armv8.S",
     "win-aarch64/crypto/fipsmodule/aesv8-armx64.S",
diff --git a/BUILD.generated_tests.bzl b/BUILD.generated_tests.bzl
index 532ca40..51a5ea4 100644
--- a/BUILD.generated_tests.bzl
+++ b/BUILD.generated_tests.bzl
@@ -12,6 +12,7 @@
     "src/crypto/cpu_arm_linux.h",
     "src/crypto/curve25519/curve25519_tables.h",
     "src/crypto/curve25519/internal.h",
+    "src/crypto/des/internal.h",
     "src/crypto/dsa/internal.h",
     "src/crypto/ec_extra/internal.h",
     "src/crypto/err/internal.h",
@@ -21,7 +22,7 @@
     "src/crypto/fipsmodule/bn/rsaz_exp.h",
     "src/crypto/fipsmodule/cipher/internal.h",
     "src/crypto/fipsmodule/delocate.h",
-    "src/crypto/fipsmodule/des/internal.h",
+    "src/crypto/fipsmodule/dh/internal.h",
     "src/crypto/fipsmodule/digest/internal.h",
     "src/crypto/fipsmodule/digest/md32_common.h",
     "src/crypto/fipsmodule/ec/internal.h",
diff --git a/android-sources.cmake b/android-sources.cmake
index 841eed9..15079b3 100644
--- a/android-sources.cmake
+++ b/android-sources.cmake
@@ -75,6 +75,7 @@
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesctrhmac.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesgcmsiv.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_chacha20poly1305.c
+  ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_des.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_null.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc2.c
   ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc4.c
@@ -93,6 +94,7 @@
   ${BORINGSSL_ROOT}src/crypto/crypto.c
   ${BORINGSSL_ROOT}src/crypto/curve25519/curve25519.c
   ${BORINGSSL_ROOT}src/crypto/curve25519/spake25519.c
+  ${BORINGSSL_ROOT}src/crypto/des/des.c
   ${BORINGSSL_ROOT}src/crypto/dh_extra/dh_asn1.c
   ${BORINGSSL_ROOT}src/crypto/dh_extra/params.c
   ${BORINGSSL_ROOT}src/crypto/digest_extra/digest_extra.c
@@ -388,30 +390,66 @@
   ${BORINGSSL_ROOT}src/ssl/ssl_c_test.c
   ${BORINGSSL_ROOT}src/ssl/ssl_test.cc
 )
-set(crypto_sources_ios_aarch64
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/chacha/chacha-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/aesv8-armx64.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/armv8-mont.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha1-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha256-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/sha512-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/fipsmodule/vpaes-armv8.S
-  ${BORINGSSL_ROOT}ios-aarch64/crypto/test/trampoline-armv8.S
+set(crypto_sources_apple_aarch64
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/chacha/chacha-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/armv8-mont.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha1-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha256-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha512-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
+  ${BORINGSSL_ROOT}apple-aarch64/crypto/test/trampoline-armv8.S
 )
-set(crypto_sources_ios_arm
-  ${BORINGSSL_ROOT}ios-arm/crypto/chacha/chacha-armv4.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/aesv8-armx32.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/armv4-mont.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/bsaes-armv7.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghash-armv4.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/ghashv8-armx32.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha1-armv4-large.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha256-armv4.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/sha512-armv4.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/fipsmodule/vpaes-armv7.S
-  ${BORINGSSL_ROOT}ios-arm/crypto/test/trampoline-armv4.S
+set(crypto_sources_apple_arm
+  ${BORINGSSL_ROOT}apple-arm/crypto/chacha/chacha-armv4.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/aesv8-armx32.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/armv4-mont.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/bsaes-armv7.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghash-armv4.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghashv8-armx32.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha1-armv4-large.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha256-armv4.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha512-armv4.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/vpaes-armv7.S
+  ${BORINGSSL_ROOT}apple-arm/crypto/test/trampoline-armv4.S
+)
+set(crypto_sources_apple_x86
+  ${BORINGSSL_ROOT}apple-x86/crypto/chacha/chacha-x86.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/aesni-x86.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/bn-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/co-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-x86.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/md5-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha1-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha256-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha512-586.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/vpaes-x86.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/x86-mont.S
+  ${BORINGSSL_ROOT}apple-x86/crypto/test/trampoline-x86.S
+)
+set(crypto_sources_apple_x86_64
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/chacha/chacha-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/md5-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
+  ${BORINGSSL_ROOT}apple-x86_64/crypto/test/trampoline-x86_64.S
 )
 set(crypto_sources_linux_aarch64
   ${BORINGSSL_ROOT}linux-aarch64/crypto/chacha/chacha-armv8.S
@@ -482,42 +520,6 @@
   ${BORINGSSL_ROOT}linux-x86_64/crypto/test/trampoline-x86_64.S
   ${BORINGSSL_ROOT}src/crypto/hrss/asm/poly_rq_mul.S
 )
-set(crypto_sources_mac_x86
-  ${BORINGSSL_ROOT}mac-x86/crypto/chacha/chacha-x86.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/aesni-x86.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/bn-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/co-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/ghash-x86.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/md5-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha1-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha256-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/sha512-586.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/vpaes-x86.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/fipsmodule/x86-mont.S
-  ${BORINGSSL_ROOT}mac-x86/crypto/test/trampoline-x86.S
-)
-set(crypto_sources_mac_x86_64
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/chacha/chacha-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/aesni-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/ghash-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/md5-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/rsaz-avx2.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha1-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/fipsmodule/x86_64-mont5.S
-  ${BORINGSSL_ROOT}mac-x86_64/crypto/test/trampoline-x86_64.S
-)
 set(crypto_sources_win_aarch64
   ${BORINGSSL_ROOT}win-aarch64/crypto/chacha/chacha-armv8.S
   ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/aesv8-armx64.S
diff --git a/apple-aarch64/crypto/chacha/chacha-armv8.S b/apple-aarch64/crypto/chacha/chacha-armv8.S
new file mode 100644
index 0000000..dd992a2
--- /dev/null
+++ b/apple-aarch64/crypto/chacha/chacha-armv8.S
@@ -0,0 +1,1992 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+
+.private_extern	_OPENSSL_armcap_P
+
+.section	__TEXT,__const
+
+.align	5
+Lsigma:
+.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
+Lone:
+.long	1,0,0,0
+.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+
+.text
+
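+// void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
+//                     const uint32_t key[8], const uint32_t counter[4]);
+// AAPCS64 arguments: x0=out, x1=in, x2=in_len, x3=key, x4=counter block.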
+.globl	_ChaCha20_ctr32
+.private_extern	_ChaCha20_ctr32
+
+.align	5
+_ChaCha20_ctr32:
+	AARCH64_VALID_CALL_TARGET
+	cbz	x2,Labort
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+	adrp	x5,_OPENSSL_armcap_P@PAGE
+#endif
+	cmp	x2,#192
+	b.lo	Lshort
+	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
+	tst	w17,#ARMV7_NEON
+	b.ne	ChaCha20_neon
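+	// Fall through to the scalar path for inputs under 192 bytes or
+	// for CPUs that do not advertise NEON.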
+
+Lshort:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ldp	x28,x30,[x4]		// load counter
+#ifdef	__AARCH64EB__
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+
+Loop_outer:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#64
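+	// 10 double rounds (20 ChaCha rounds) on the 4x4 state held in
+	// w5-w17/w19-w21 (x18 is reserved on Apple targets); the left
+	// rotations by 16/12/8/7 appear as ror by 16/20/24/25 after eor.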
+Loop:
+	sub	x4,x4,#1
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	ror	w21,w21,#16
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#20
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	add	w5,w5,w9
+	add	w6,w6,w10
+	add	w7,w7,w11
+	add	w8,w8,w12
+	eor	w17,w17,w5
+	eor	w19,w19,w6
+	eor	w20,w20,w7
+	eor	w21,w21,w8
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	ror	w21,w21,#24
+	add	w13,w13,w17
+	add	w14,w14,w19
+	add	w15,w15,w20
+	add	w16,w16,w21
+	eor	w9,w9,w13
+	eor	w10,w10,w14
+	eor	w11,w11,w15
+	eor	w12,w12,w16
+	ror	w9,w9,#25
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#16
+	ror	w17,w17,#16
+	ror	w19,w19,#16
+	ror	w20,w20,#16
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#20
+	ror	w11,w11,#20
+	ror	w12,w12,#20
+	ror	w9,w9,#20
+	add	w5,w5,w10
+	add	w6,w6,w11
+	add	w7,w7,w12
+	add	w8,w8,w9
+	eor	w21,w21,w5
+	eor	w17,w17,w6
+	eor	w19,w19,w7
+	eor	w20,w20,w8
+	ror	w21,w21,#24
+	ror	w17,w17,#24
+	ror	w19,w19,#24
+	ror	w20,w20,#24
+	add	w15,w15,w21
+	add	w16,w16,w17
+	add	w13,w13,w19
+	add	w14,w14,w20
+	eor	w10,w10,w15
+	eor	w11,w11,w16
+	eor	w12,w12,w13
+	eor	w9,w9,w14
+	ror	w10,w10,#25
+	ror	w11,w11,#25
+	ror	w12,w12,#25
+	ror	w9,w9,#25
+	cbnz	x4,Loop
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	b.lo	Ltail
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+
+	b.hi	Loop_outer
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+Labort:
+	ret
+
+.align	4
+Ltail:
+	add	x2,x2,#64
+Less_than_64:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	stp	x5,x7,[sp,#0]
+	stp	x9,x11,[sp,#16]
+	stp	x13,x15,[sp,#32]
+	stp	x17,x20,[sp,#48]
+
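+	// XOR the final 1-63 bytes against the keystream spilled to the
+	// stack above, then wipe that keystream copy.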
+Loop_tail:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+
+.align	5
+ChaCha20_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	cmp	x2,#512
+	b.hs	L512_or_more_neon
+
+	sub	sp,sp,#64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+Loop_outer_neon:
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	mov	v0.16b,v24.16b
+	mov	w7,w23
+	lsr	x8,x23,#32
+	mov	v4.16b,v24.16b
+	mov	w9,w24
+	lsr	x10,x24,#32
+	mov	v16.16b,v24.16b
+	mov	w11,w25
+	mov	v1.16b,v25.16b
+	lsr	x12,x25,#32
+	mov	v5.16b,v25.16b
+	mov	w13,w26
+	mov	v17.16b,v25.16b
+	lsr	x14,x26,#32
+	mov	v3.16b,v27.16b
+	mov	w15,w27
+	mov	v7.16b,v28.16b
+	lsr	x16,x27,#32
+	mov	v19.16b,v29.16b
+	mov	w17,w28
+	mov	v2.16b,v26.16b
+	lsr	x19,x28,#32
+	mov	v6.16b,v26.16b
+	mov	w20,w30
+	mov	v18.16b,v26.16b
+	lsr	x21,x30,#32
+
+	mov	x4,#10
+	subs	x2,x2,#256
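+	// 256 bytes per outer iteration: one block in the scalar registers
+	// interleaved with three more in v0-v3, v4-v7 and v16-v19.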
+Loop_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w11
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w12
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w17,w17,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w19,w19,w6
+	rev32	v3.8h,v3.8h
+	eor	w20,w20,w7
+	rev32	v7.8h,v7.8h
+	eor	w21,w21,w8
+	rev32	v19.8h,v19.8h
+	ror	w17,w17,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#20
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#20
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#20
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#12
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#12
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#12
+	ror	w9,w9,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w10,w10,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w11,w11,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w12,w12,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w9
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w10
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w11
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w12
+	ushr	v7.4s,v21.4s,#24
+	eor	w17,w17,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w19,w19,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w20,w20,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w21,w21,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w17,w17,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w19,w19,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w20,w20,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w21,w21,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w13,w13,w17
+	eor	v21.16b,v5.16b,v6.16b
+	add	w14,w14,w19
+	eor	v22.16b,v17.16b,v18.16b
+	add	w15,w15,w20
+	ushr	v1.4s,v20.4s,#25
+	add	w16,w16,w21
+	ushr	v5.4s,v21.4s,#25
+	eor	w9,w9,w13
+	ushr	v17.4s,v22.4s,#25
+	eor	w10,w10,w14
+	sli	v1.4s,v20.4s,#7
+	eor	w11,w11,w15
+	sli	v5.4s,v21.4s,#7
+	eor	w12,w12,w16
+	sli	v17.4s,v22.4s,#7
+	ror	w9,w9,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w10,w10,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w10
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w11
+	add	v16.4s,v16.4s,v17.4s
+	add	w7,w7,w12
+	eor	v3.16b,v3.16b,v0.16b
+	add	w8,w8,w9
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w5
+	eor	v19.16b,v19.16b,v16.16b
+	eor	w17,w17,w6
+	rev32	v3.8h,v3.8h
+	eor	w19,w19,w7
+	rev32	v7.8h,v7.8h
+	eor	w20,w20,w8
+	rev32	v19.8h,v19.8h
+	ror	w21,w21,#16
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#16
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#16
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#16
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#20
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#20
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#20
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#12
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#12
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#12
+	ror	w10,w10,#20
+	add	v0.4s,v0.4s,v1.4s
+	ror	w11,w11,#20
+	add	v4.4s,v4.4s,v5.4s
+	ror	w12,w12,#20
+	add	v16.4s,v16.4s,v17.4s
+	ror	w9,w9,#20
+	eor	v20.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v21.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v22.16b,v19.16b,v16.16b
+	add	w7,w7,w12
+	ushr	v3.4s,v20.4s,#24
+	add	w8,w8,w9
+	ushr	v7.4s,v21.4s,#24
+	eor	w21,w21,w5
+	ushr	v19.4s,v22.4s,#24
+	eor	w17,w17,w6
+	sli	v3.4s,v20.4s,#8
+	eor	w19,w19,w7
+	sli	v7.4s,v21.4s,#8
+	eor	w20,w20,w8
+	sli	v19.4s,v22.4s,#8
+	ror	w21,w21,#24
+	add	v2.4s,v2.4s,v3.4s
+	ror	w17,w17,#24
+	add	v6.4s,v6.4s,v7.4s
+	ror	w19,w19,#24
+	add	v18.4s,v18.4s,v19.4s
+	ror	w20,w20,#24
+	eor	v20.16b,v1.16b,v2.16b
+	add	w15,w15,w21
+	eor	v21.16b,v5.16b,v6.16b
+	add	w16,w16,w17
+	eor	v22.16b,v17.16b,v18.16b
+	add	w13,w13,w19
+	ushr	v1.4s,v20.4s,#25
+	add	w14,w14,w20
+	ushr	v5.4s,v21.4s,#25
+	eor	w10,w10,w15
+	ushr	v17.4s,v22.4s,#25
+	eor	w11,w11,w16
+	sli	v1.4s,v20.4s,#7
+	eor	w12,w12,w13
+	sli	v5.4s,v21.4s,#7
+	eor	w9,w9,w14
+	sli	v17.4s,v22.4s,#7
+	ror	w10,w10,#25
+	ext	v2.16b,v2.16b,v2.16b,#8
+	ror	w11,w11,#25
+	ext	v6.16b,v6.16b,v6.16b,#8
+	ror	w12,w12,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	cbnz	x4,Loop_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	v0.4s,v0.4s,v24.4s
+	add	x6,x6,x22,lsr#32
+	add	v4.4s,v4.4s,v24.4s
+	add	w7,w7,w23
+	add	v16.4s,v16.4s,v24.4s
+	add	x8,x8,x23,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w9,w9,w24
+	add	v6.4s,v6.4s,v26.4s
+	add	x10,x10,x24,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w11,w11,w25
+	add	v3.4s,v3.4s,v27.4s
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	v7.4s,v7.4s,v28.4s
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	v19.4s,v19.4s,v29.4s
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	v1.4s,v1.4s,v25.4s
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	v5.4s,v5.4s,v25.4s
+	add	x21,x21,x30,lsr#32
+	add	v17.4s,v17.4s,v25.4s
+
+	b.lo	Ltail_neon
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v20.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v21.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v22.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v23.16b
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	add	v27.4s,v27.4s,v31.4s		// += 4
+	stp	x13,x15,[x0,#32]
+	add	v28.4s,v28.4s,v31.4s
+	stp	x17,x20,[x0,#48]
+	add	v29.4s,v29.4s,v31.4s
+	add	x0,x0,#64
+
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	eor	v16.16b,v16.16b,v0.16b
+	eor	v17.16b,v17.16b,v1.16b
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v19.16b,v19.16b,v3.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	b.hi	Loop_outer_neon
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+Ltail_neon:
+	add	x2,x2,#256
+	cmp	x2,#64
+	b.lo	Less_than_64
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#4			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_128
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v0.16b,v0.16b,v20.16b
+	eor	v1.16b,v1.16b,v21.16b
+	eor	v2.16b,v2.16b,v22.16b
+	eor	v3.16b,v3.16b,v23.16b
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+	cmp	x2,#64
+	b.lo	Less_than_192
+
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+	eor	v4.16b,v4.16b,v20.16b
+	eor	v5.16b,v5.16b,v21.16b
+	eor	v6.16b,v6.16b,v22.16b
+	eor	v7.16b,v7.16b,v23.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+	b.eq	Ldone_neon
+	sub	x2,x2,#64
+
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+	b	Last_neon
+
+Less_than_128:
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+	b	Last_neon
+Less_than_192:
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+	b	Last_neon
+
+.align	4
+Last_neon:
+	sub	x0,x0,#1
+	add	x1,x1,x2
+	add	x0,x0,x2
+	add	x4,sp,x2
+	neg	x2,x2
+
+Loop_tail_neon:
+	ldrb	w10,[x1,x2]
+	ldrb	w11,[x4,x2]
+	add	x2,x2,#1
+	eor	w10,w10,w11
+	strb	w10,[x0,x2]
+	cbnz	x2,Loop_tail_neon
+
+	stp	xzr,xzr,[sp,#0]
+	stp	xzr,xzr,[sp,#16]
+	stp	xzr,xzr,[sp,#32]
+	stp	xzr,xzr,[sp,#48]
+
+Ldone_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.align	5
+ChaCha20_512_neon:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+
+	adrp	x5,Lsigma@PAGE
+	add	x5,x5,Lsigma@PAGEOFF
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+L512_or_more_neon:
+	sub	sp,sp,#128+64
+
+	ldp	x22,x23,[x5]		// load sigma
+	ld1	{v24.4s},[x5],#16
+	ldp	x24,x25,[x3]		// load key
+	ldp	x26,x27,[x3,#16]
+	ld1	{v25.4s,v26.4s},[x3]
+	ldp	x28,x30,[x4]		// load counter
+	ld1	{v27.4s},[x4]
+	ld1	{v31.4s},[x5]
+#ifdef	__AARCH64EB__
+	rev64	v24.4s,v24.4s
+	ror	x24,x24,#32
+	ror	x25,x25,#32
+	ror	x26,x26,#32
+	ror	x27,x27,#32
+	ror	x28,x28,#32
+	ror	x30,x30,#32
+#endif
+	add	v27.4s,v27.4s,v31.4s		// += 1
+	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
+	add	v27.4s,v27.4s,v31.4s		// not typo
+	str	q26,[sp,#32]
+	add	v28.4s,v27.4s,v31.4s
+	add	v29.4s,v28.4s,v31.4s
+	add	v30.4s,v29.4s,v31.4s
+	shl	v31.4s,v31.4s,#2			// 1 -> 4
+
+	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	stp	d10,d11,[sp,#128+16]
+	stp	d12,d13,[sp,#128+32]
+	stp	d14,d15,[sp,#128+48]
+
+	sub	x2,x2,#512			// not typo
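+	// Eight blocks (512 bytes) per outer iteration: six held in v0-v23
+	// plus two scalar blocks, the scalar pipe running two double rounds
+	// per vector double round.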
+
+Loop_outer_512_neon:
+	mov	v0.16b,v24.16b
+	mov	v4.16b,v24.16b
+	mov	v8.16b,v24.16b
+	mov	v12.16b,v24.16b
+	mov	v16.16b,v24.16b
+	mov	v20.16b,v24.16b
+	mov	v1.16b,v25.16b
+	mov	w5,w22			// unpack key block
+	mov	v5.16b,v25.16b
+	lsr	x6,x22,#32
+	mov	v9.16b,v25.16b
+	mov	w7,w23
+	mov	v13.16b,v25.16b
+	lsr	x8,x23,#32
+	mov	v17.16b,v25.16b
+	mov	w9,w24
+	mov	v21.16b,v25.16b
+	lsr	x10,x24,#32
+	mov	v3.16b,v27.16b
+	mov	w11,w25
+	mov	v7.16b,v28.16b
+	lsr	x12,x25,#32
+	mov	v11.16b,v29.16b
+	mov	w13,w26
+	mov	v15.16b,v30.16b
+	lsr	x14,x26,#32
+	mov	v2.16b,v26.16b
+	mov	w15,w27
+	mov	v6.16b,v26.16b
+	lsr	x16,x27,#32
+	add	v19.4s,v3.4s,v31.4s			// +4
+	mov	w17,w28
+	add	v23.4s,v7.4s,v31.4s			// +4
+	lsr	x19,x28,#32
+	mov	v10.16b,v26.16b
+	mov	w20,w30
+	mov	v14.16b,v26.16b
+	lsr	x21,x30,#32
+	mov	v18.16b,v26.16b
+	stp	q27,q28,[sp,#48]		// off-load key block, variable part
+	mov	v22.16b,v26.16b
+	str	q29,[sp,#80]
+
+	mov	x4,#5
+	subs	x2,x2,#512
+Loop_upper_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_upper_neon
+
+	add	w5,w5,w22		// accumulate key block
+	add	x6,x6,x22,lsr#32
+	add	w7,w7,w23
+	add	x8,x8,x23,lsr#32
+	add	w9,w9,w24
+	add	x10,x10,x24,lsr#32
+	add	w11,w11,w25
+	add	x12,x12,x25,lsr#32
+	add	w13,w13,w26
+	add	x14,x14,x26,lsr#32
+	add	w15,w15,w27
+	add	x16,x16,x27,lsr#32
+	add	w17,w17,w28
+	add	x19,x19,x28,lsr#32
+	add	w20,w20,w30
+	add	x21,x21,x30,lsr#32
+
+	add	x5,x5,x6,lsl#32	// pack
+	add	x7,x7,x8,lsl#32
+	ldp	x6,x8,[x1,#0]		// load input
+	add	x9,x9,x10,lsl#32
+	add	x11,x11,x12,lsl#32
+	ldp	x10,x12,[x1,#16]
+	add	x13,x13,x14,lsl#32
+	add	x15,x15,x16,lsl#32
+	ldp	x14,x16,[x1,#32]
+	add	x17,x17,x19,lsl#32
+	add	x20,x20,x21,lsl#32
+	ldp	x19,x21,[x1,#48]
+	add	x1,x1,#64
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	x15,x15,x16
+	eor	x17,x17,x19
+	eor	x20,x20,x21
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#1			// increment counter
+	mov	w5,w22			// unpack key block
+	lsr	x6,x22,#32
+	stp	x9,x11,[x0,#16]
+	mov	w7,w23
+	lsr	x8,x23,#32
+	stp	x13,x15,[x0,#32]
+	mov	w9,w24
+	lsr	x10,x24,#32
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	mov	w11,w25
+	lsr	x12,x25,#32
+	mov	w13,w26
+	lsr	x14,x26,#32
+	mov	w15,w27
+	lsr	x16,x27,#32
+	mov	w17,w28
+	lsr	x19,x28,#32
+	mov	w20,w30
+	lsr	x21,x30,#32
+
+	mov	x4,#5
+Loop_lower_neon:
+	sub	x4,x4,#1
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#12
+	ext	v7.16b,v7.16b,v7.16b,#12
+	ext	v11.16b,v11.16b,v11.16b,#12
+	ext	v15.16b,v15.16b,v15.16b,#12
+	ext	v19.16b,v19.16b,v19.16b,#12
+	ext	v23.16b,v23.16b,v23.16b,#12
+	ext	v1.16b,v1.16b,v1.16b,#4
+	ext	v5.16b,v5.16b,v5.16b,#4
+	ext	v9.16b,v9.16b,v9.16b,#4
+	ext	v13.16b,v13.16b,v13.16b,#4
+	ext	v17.16b,v17.16b,v17.16b,#4
+	ext	v21.16b,v21.16b,v21.16b,#4
+	add	v0.4s,v0.4s,v1.4s
+	add	w5,w5,w9
+	add	v4.4s,v4.4s,v5.4s
+	add	w6,w6,w10
+	add	v8.4s,v8.4s,v9.4s
+	add	w7,w7,w11
+	add	v12.4s,v12.4s,v13.4s
+	add	w8,w8,w12
+	add	v16.4s,v16.4s,v17.4s
+	eor	w17,w17,w5
+	add	v20.4s,v20.4s,v21.4s
+	eor	w19,w19,w6
+	eor	v3.16b,v3.16b,v0.16b
+	eor	w20,w20,w7
+	eor	v7.16b,v7.16b,v4.16b
+	eor	w21,w21,w8
+	eor	v11.16b,v11.16b,v8.16b
+	ror	w17,w17,#16
+	eor	v15.16b,v15.16b,v12.16b
+	ror	w19,w19,#16
+	eor	v19.16b,v19.16b,v16.16b
+	ror	w20,w20,#16
+	eor	v23.16b,v23.16b,v20.16b
+	ror	w21,w21,#16
+	rev32	v3.8h,v3.8h
+	add	w13,w13,w17
+	rev32	v7.8h,v7.8h
+	add	w14,w14,w19
+	rev32	v11.8h,v11.8h
+	add	w15,w15,w20
+	rev32	v15.8h,v15.8h
+	add	w16,w16,w21
+	rev32	v19.8h,v19.8h
+	eor	w9,w9,w13
+	rev32	v23.8h,v23.8h
+	eor	w10,w10,w14
+	add	v2.4s,v2.4s,v3.4s
+	eor	w11,w11,w15
+	add	v6.4s,v6.4s,v7.4s
+	eor	w12,w12,w16
+	add	v10.4s,v10.4s,v11.4s
+	ror	w9,w9,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w10,w10,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w11,w11,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w12,w12,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w9
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w10
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w11
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w12
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w17,w17,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w19,w19,w6
+	ushr	v1.4s,v24.4s,#20
+	eor	w20,w20,w7
+	ushr	v5.4s,v25.4s,#20
+	eor	w21,w21,w8
+	ushr	v9.4s,v26.4s,#20
+	ror	w17,w17,#24
+	ushr	v13.4s,v27.4s,#20
+	ror	w19,w19,#24
+	ushr	v17.4s,v28.4s,#20
+	ror	w20,w20,#24
+	ushr	v21.4s,v29.4s,#20
+	ror	w21,w21,#24
+	sli	v1.4s,v24.4s,#12
+	add	w13,w13,w17
+	sli	v5.4s,v25.4s,#12
+	add	w14,w14,w19
+	sli	v9.4s,v26.4s,#12
+	add	w15,w15,w20
+	sli	v13.4s,v27.4s,#12
+	add	w16,w16,w21
+	sli	v17.4s,v28.4s,#12
+	eor	w9,w9,w13
+	sli	v21.4s,v29.4s,#12
+	eor	w10,w10,w14
+	add	v0.4s,v0.4s,v1.4s
+	eor	w11,w11,w15
+	add	v4.4s,v4.4s,v5.4s
+	eor	w12,w12,w16
+	add	v8.4s,v8.4s,v9.4s
+	ror	w9,w9,#25
+	add	v12.4s,v12.4s,v13.4s
+	ror	w10,w10,#25
+	add	v16.4s,v16.4s,v17.4s
+	ror	w11,w11,#25
+	add	v20.4s,v20.4s,v21.4s
+	ror	w12,w12,#25
+	eor	v24.16b,v3.16b,v0.16b
+	add	w5,w5,w10
+	eor	v25.16b,v7.16b,v4.16b
+	add	w6,w6,w11
+	eor	v26.16b,v11.16b,v8.16b
+	add	w7,w7,w12
+	eor	v27.16b,v15.16b,v12.16b
+	add	w8,w8,w9
+	eor	v28.16b,v19.16b,v16.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v23.16b,v20.16b
+	eor	w17,w17,w6
+	ushr	v3.4s,v24.4s,#24
+	eor	w19,w19,w7
+	ushr	v7.4s,v25.4s,#24
+	eor	w20,w20,w8
+	ushr	v11.4s,v26.4s,#24
+	ror	w21,w21,#16
+	ushr	v15.4s,v27.4s,#24
+	ror	w17,w17,#16
+	ushr	v19.4s,v28.4s,#24
+	ror	w19,w19,#16
+	ushr	v23.4s,v29.4s,#24
+	ror	w20,w20,#16
+	sli	v3.4s,v24.4s,#8
+	add	w15,w15,w21
+	sli	v7.4s,v25.4s,#8
+	add	w16,w16,w17
+	sli	v11.4s,v26.4s,#8
+	add	w13,w13,w19
+	sli	v15.4s,v27.4s,#8
+	add	w14,w14,w20
+	sli	v19.4s,v28.4s,#8
+	eor	w10,w10,w15
+	sli	v23.4s,v29.4s,#8
+	eor	w11,w11,w16
+	add	v2.4s,v2.4s,v3.4s
+	eor	w12,w12,w13
+	add	v6.4s,v6.4s,v7.4s
+	eor	w9,w9,w14
+	add	v10.4s,v10.4s,v11.4s
+	ror	w10,w10,#20
+	add	v14.4s,v14.4s,v15.4s
+	ror	w11,w11,#20
+	add	v18.4s,v18.4s,v19.4s
+	ror	w12,w12,#20
+	add	v22.4s,v22.4s,v23.4s
+	ror	w9,w9,#20
+	eor	v24.16b,v1.16b,v2.16b
+	add	w5,w5,w10
+	eor	v25.16b,v5.16b,v6.16b
+	add	w6,w6,w11
+	eor	v26.16b,v9.16b,v10.16b
+	add	w7,w7,w12
+	eor	v27.16b,v13.16b,v14.16b
+	add	w8,w8,w9
+	eor	v28.16b,v17.16b,v18.16b
+	eor	w21,w21,w5
+	eor	v29.16b,v21.16b,v22.16b
+	eor	w17,w17,w6
+	ushr	v1.4s,v24.4s,#25
+	eor	w19,w19,w7
+	ushr	v5.4s,v25.4s,#25
+	eor	w20,w20,w8
+	ushr	v9.4s,v26.4s,#25
+	ror	w21,w21,#24
+	ushr	v13.4s,v27.4s,#25
+	ror	w17,w17,#24
+	ushr	v17.4s,v28.4s,#25
+	ror	w19,w19,#24
+	ushr	v21.4s,v29.4s,#25
+	ror	w20,w20,#24
+	sli	v1.4s,v24.4s,#7
+	add	w15,w15,w21
+	sli	v5.4s,v25.4s,#7
+	add	w16,w16,w17
+	sli	v9.4s,v26.4s,#7
+	add	w13,w13,w19
+	sli	v13.4s,v27.4s,#7
+	add	w14,w14,w20
+	sli	v17.4s,v28.4s,#7
+	eor	w10,w10,w15
+	sli	v21.4s,v29.4s,#7
+	eor	w11,w11,w16
+	ext	v2.16b,v2.16b,v2.16b,#8
+	eor	w12,w12,w13
+	ext	v6.16b,v6.16b,v6.16b,#8
+	eor	w9,w9,w14
+	ext	v10.16b,v10.16b,v10.16b,#8
+	ror	w10,w10,#25
+	ext	v14.16b,v14.16b,v14.16b,#8
+	ror	w11,w11,#25
+	ext	v18.16b,v18.16b,v18.16b,#8
+	ror	w12,w12,#25
+	ext	v22.16b,v22.16b,v22.16b,#8
+	ror	w9,w9,#25
+	ext	v3.16b,v3.16b,v3.16b,#4
+	ext	v7.16b,v7.16b,v7.16b,#4
+	ext	v11.16b,v11.16b,v11.16b,#4
+	ext	v15.16b,v15.16b,v15.16b,#4
+	ext	v19.16b,v19.16b,v19.16b,#4
+	ext	v23.16b,v23.16b,v23.16b,#4
+	ext	v1.16b,v1.16b,v1.16b,#12
+	ext	v5.16b,v5.16b,v5.16b,#12
+	ext	v9.16b,v9.16b,v9.16b,#12
+	ext	v13.16b,v13.16b,v13.16b,#12
+	ext	v17.16b,v17.16b,v17.16b,#12
+	ext	v21.16b,v21.16b,v21.16b,#12
+	cbnz	x4,Loop_lower_neon
+
+	add	w5,w5,w22		// accumulate key block
+	ldp	q24,q25,[sp,#0]
+	add	x6,x6,x22,lsr#32
+	ldp	q26,q27,[sp,#32]
+	add	w7,w7,w23
+	ldp	q28,q29,[sp,#64]
+	add	x8,x8,x23,lsr#32
+	add	v0.4s,v0.4s,v24.4s
+	add	w9,w9,w24
+	add	v4.4s,v4.4s,v24.4s
+	add	x10,x10,x24,lsr#32
+	add	v8.4s,v8.4s,v24.4s
+	add	w11,w11,w25
+	add	v12.4s,v12.4s,v24.4s
+	add	x12,x12,x25,lsr#32
+	add	v16.4s,v16.4s,v24.4s
+	add	w13,w13,w26
+	add	v20.4s,v20.4s,v24.4s
+	add	x14,x14,x26,lsr#32
+	add	v2.4s,v2.4s,v26.4s
+	add	w15,w15,w27
+	add	v6.4s,v6.4s,v26.4s
+	add	x16,x16,x27,lsr#32
+	add	v10.4s,v10.4s,v26.4s
+	add	w17,w17,w28
+	add	v14.4s,v14.4s,v26.4s
+	add	x19,x19,x28,lsr#32
+	add	v18.4s,v18.4s,v26.4s
+	add	w20,w20,w30
+	add	v22.4s,v22.4s,v26.4s
+	add	x21,x21,x30,lsr#32
+	add	v19.4s,v19.4s,v31.4s			// +4
+	add	x5,x5,x6,lsl#32	// pack
+	add	v23.4s,v23.4s,v31.4s			// +4
+	add	x7,x7,x8,lsl#32
+	add	v3.4s,v3.4s,v27.4s
+	ldp	x6,x8,[x1,#0]		// load input
+	add	v7.4s,v7.4s,v28.4s
+	add	x9,x9,x10,lsl#32
+	add	v11.4s,v11.4s,v29.4s
+	add	x11,x11,x12,lsl#32
+	add	v15.4s,v15.4s,v30.4s
+	ldp	x10,x12,[x1,#16]
+	add	v19.4s,v19.4s,v27.4s
+	add	x13,x13,x14,lsl#32
+	add	v23.4s,v23.4s,v28.4s
+	add	x15,x15,x16,lsl#32
+	add	v1.4s,v1.4s,v25.4s
+	ldp	x14,x16,[x1,#32]
+	add	v5.4s,v5.4s,v25.4s
+	add	x17,x17,x19,lsl#32
+	add	v9.4s,v9.4s,v25.4s
+	add	x20,x20,x21,lsl#32
+	add	v13.4s,v13.4s,v25.4s
+	ldp	x19,x21,[x1,#48]
+	add	v17.4s,v17.4s,v25.4s
+	add	x1,x1,#64
+	add	v21.4s,v21.4s,v25.4s
+
+#ifdef	__AARCH64EB__
+	rev	x5,x5
+	rev	x7,x7
+	rev	x9,x9
+	rev	x11,x11
+	rev	x13,x13
+	rev	x15,x15
+	rev	x17,x17
+	rev	x20,x20
+#endif
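+	// (Byte-swap the packed 64-bit key-stream words on big-endian
+	// targets so that the xor against the input words loaded above
+	// is byte-order correct.)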
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+	eor	x5,x5,x6
+	eor	x7,x7,x8
+	eor	x9,x9,x10
+	eor	x11,x11,x12
+	eor	x13,x13,x14
+	eor	v0.16b,v0.16b,v24.16b
+	eor	x15,x15,x16
+	eor	v1.16b,v1.16b,v25.16b
+	eor	x17,x17,x19
+	eor	v2.16b,v2.16b,v26.16b
+	eor	x20,x20,x21
+	eor	v3.16b,v3.16b,v27.16b
+	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+	stp	x5,x7,[x0,#0]		// store output
+	add	x28,x28,#7			// increment counter
+	stp	x9,x11,[x0,#16]
+	stp	x13,x15,[x0,#32]
+	stp	x17,x20,[x0,#48]
+	add	x0,x0,#64
+	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+	eor	v4.16b,v4.16b,v24.16b
+	eor	v5.16b,v5.16b,v25.16b
+	eor	v6.16b,v6.16b,v26.16b
+	eor	v7.16b,v7.16b,v27.16b
+	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	eor	v8.16b,v8.16b,v0.16b
+	ldp	q24,q25,[sp,#0]
+	eor	v9.16b,v9.16b,v1.16b
+	ldp	q26,q27,[sp,#32]
+	eor	v10.16b,v10.16b,v2.16b
+	eor	v11.16b,v11.16b,v3.16b
+	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+	eor	v12.16b,v12.16b,v4.16b
+	eor	v13.16b,v13.16b,v5.16b
+	eor	v14.16b,v14.16b,v6.16b
+	eor	v15.16b,v15.16b,v7.16b
+	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+	eor	v16.16b,v16.16b,v8.16b
+	eor	v17.16b,v17.16b,v9.16b
+	eor	v18.16b,v18.16b,v10.16b
+	eor	v19.16b,v19.16b,v11.16b
+	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+	shl	v0.4s,v31.4s,#1			// 4 -> 8
+	eor	v20.16b,v20.16b,v12.16b
+	eor	v21.16b,v21.16b,v13.16b
+	eor	v22.16b,v22.16b,v14.16b
+	eor	v23.16b,v23.16b,v15.16b
+	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+	add	v27.4s,v27.4s,v0.4s			// += 8
+	add	v28.4s,v28.4s,v0.4s
+	add	v29.4s,v29.4s,v0.4s
+	add	v30.4s,v30.4s,v0.4s
+
+	b.hs	Loop_outer_512_neon
+
+	adds	x2,x2,#512
+	ushr	v0.4s,v31.4s,#2			// 4 -> 1
+
+	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
+	ldp	d10,d11,[sp,#128+16]
+	ldp	d12,d13,[sp,#128+32]
+	ldp	d14,d15,[sp,#128+48]
+
+	stp	q24,q31,[sp,#0]		// wipe off-load area
+	stp	q24,q31,[sp,#32]
+	stp	q24,q31,[sp,#64]
+
+	b.eq	Ldone_512_neon
+
+	cmp	x2,#192
+	sub	v27.4s,v27.4s,v0.4s			// -= 1
+	sub	v28.4s,v28.4s,v0.4s
+	sub	v29.4s,v29.4s,v0.4s
+	add	sp,sp,#128
+	b.hs	Loop_outer_neon
+
+	eor	v25.16b,v25.16b,v25.16b
+	eor	v26.16b,v26.16b,v26.16b
+	eor	v27.16b,v27.16b,v27.16b
+	eor	v28.16b,v28.16b,v28.16b
+	eor	v29.16b,v29.16b,v29.16b
+	eor	v30.16b,v30.16b,v30.16b
+	b	Loop_outer
+
+Ldone_512_neon:
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#128+64
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#96
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S b/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
new file mode 100644
index 0000000..50d7dea
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -0,0 +1,799 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.section	__TEXT,__const
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+
+.align	5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	mov	x3,#-1
+	cmp	x0,#0
+	b.eq	Lenc_key_abort
+	cmp	x2,#0
+	b.eq	Lenc_key_abort
+	mov	x3,#-2
+	cmp	w1,#128
+	b.lt	Lenc_key_abort
+	cmp	w1,#256
+	b.gt	Lenc_key_abort
+	tst	w1,#0x3f
+	b.ne	Lenc_key_abort
+
+	adrp	x3,Lrcon@PAGE
+	add	x3,x3,Lrcon@PAGEOFF
+	cmp	w1,#192
+
+	eor	v0.16b,v0.16b,v0.16b
+	ld1	{v3.16b},[x0],#16
+	mov	w1,#8		// reuse w1
+	ld1	{v1.4s,v2.4s},[x3],#32
+
+	b.lt	Loop128
+	b.eq	L192
+	b	L256
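+	// (Key sizes dispatch on the cmp w1,#192 above: 128-bit keys
+	// branch to Loop128, 192-bit keys to L192, 256-bit keys to L256.)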
+
+.align	4
+Loop128:
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	b.ne	Loop128
+
+	ld1	{v1.4s},[x3]
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+
+	tbl	v6.16b,{v3.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v3.4s},[x2],#16
+	aese	v6.16b,v0.16b
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2]
+	add	x2,x2,#0x50
+
+	mov	w12,#10
+	b	Ldone
+
+.align	4
+L192:
+	ld1	{v4.8b},[x0],#8
+	movi	v6.16b,#8			// borrow v6.16b
+	st1	{v3.4s},[x2],#16
+	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
+
+Loop192:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.8b},[x2],#8
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+
+	dup	v5.4s,v3.s[3]
+	eor	v5.16b,v5.16b,v4.16b
+	eor	v6.16b,v6.16b,v1.16b
+	ext	v4.16b,v0.16b,v4.16b,#12
+	shl	v1.16b,v1.16b,#1
+	eor	v4.16b,v4.16b,v5.16b
+	eor	v3.16b,v3.16b,v6.16b
+	eor	v4.16b,v4.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.ne	Loop192
+
+	mov	w12,#12
+	add	x2,x2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	ld1	{v4.16b},[x0]
+	mov	w1,#7
+	mov	w12,#14
+	st1	{v3.4s},[x2],#16
+
+Loop256:
+	tbl	v6.16b,{v4.16b},v2.16b
+	ext	v5.16b,v0.16b,v3.16b,#12
+	st1	{v4.4s},[x2],#16
+	aese	v6.16b,v0.16b
+	subs	w1,w1,#1
+
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v3.16b,v3.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v6.16b,v6.16b,v1.16b
+	eor	v3.16b,v3.16b,v5.16b
+	shl	v1.16b,v1.16b,#1
+	eor	v3.16b,v3.16b,v6.16b
+	st1	{v3.4s},[x2],#16
+	b.eq	Ldone
+
+	dup	v6.4s,v3.s[3]		// just splat
+	ext	v5.16b,v0.16b,v4.16b,#12
+	aese	v6.16b,v0.16b
+
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+	ext	v5.16b,v0.16b,v5.16b,#12
+	eor	v4.16b,v4.16b,v5.16b
+
+	eor	v4.16b,v4.16b,v6.16b
+	b	Loop256
+
+Ldone:
+	str	w12,[x2]
+	mov	x3,#0
+
+Lenc_key_abort:
+	mov	x0,x3			// return value
+	ldr	x29,[sp],#16
+	ret
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+
+.align	5
+_aes_hw_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	bl	Lenc_key
+
+	cmp	x0,#0
+	b.ne	Ldec_key_abort
+
+	sub	x2,x2,#240		// restore original x2
+	mov	x4,#-16
+	add	x0,x2,x12,lsl#4	// end of key schedule
+
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+
+Loop_imc:
+	ld1	{v0.4s},[x2]
+	ld1	{v1.4s},[x0]
+	aesimc	v0.16b,v0.16b
+	aesimc	v1.16b,v1.16b
+	st1	{v0.4s},[x0],x4
+	st1	{v1.4s},[x2],#16
+	cmp	x0,x2
+	b.hi	Loop_imc
+
+	ld1	{v0.4s},[x2]
+	aesimc	v0.16b,v0.16b
+	st1	{v0.4s},[x0]
+
+	eor	x0,x0,x0		// return value
+Ldec_key_abort:
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+
+.align	5
+_aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_enc:
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aese	v2.16b,v1.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_enc
+
+	aese	v2.16b,v0.16b
+	aesmc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aese	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+
+.align	5
+_aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	w3,[x2,#240]
+	ld1	{v0.4s},[x2],#16
+	ld1	{v2.16b},[x0]
+	sub	w3,w3,#2
+	ld1	{v1.4s},[x2],#16
+
+Loop_dec:
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2],#16
+	subs	w3,w3,#2
+	aesd	v2.16b,v1.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v1.4s},[x2],#16
+	b.gt	Loop_dec
+
+	aesd	v2.16b,v0.16b
+	aesimc	v2.16b,v2.16b
+	ld1	{v0.4s},[x2]
+	aesd	v2.16b,v1.16b
+	eor	v2.16b,v2.16b,v0.16b
+
+	st1	{v2.16b},[x1]
+	ret
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+
+.align	5
+_aes_hw_cbc_encrypt:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	subs	x2,x2,#16
+	mov	x8,#16
+	b.lo	Lcbc_abort
+	csel	x8,xzr,x8,eq
+
+	cmp	w5,#0			// en- or decrypting?
+	ldr	w5,[x3,#240]
+	and	x2,x2,#-16
+	ld1	{v6.16b},[x4]
+	ld1	{v0.16b},[x0],x8
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#6
+	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
+	sub	w5,w5,#2
+	ld1	{v18.4s,v19.4s},[x7],#32
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+
+	add	x7,x3,#32
+	mov	w6,w5
+	b.eq	Lcbc_dec
+
+	cmp	w5,#2
+	eor	v0.16b,v0.16b,v6.16b
+	eor	v5.16b,v16.16b,v7.16b
+	b.eq	Lcbc_enc128
+
+	ld1	{v2.4s,v3.4s},[x7]
+	add	x7,x3,#16
+	add	x6,x3,#16*4
+	add	x12,x3,#16*5
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	add	x14,x3,#16*6
+	add	x3,x3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x6]
+	cmp	w5,#4
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x12]
+	b.eq	Lcbc_enc192
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.4s},[x14]
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x3]
+	nop
+
+Lcbc_enc192:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	ld1	{v2.4s,v3.4s},[x7]
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	st1	{v6.16b},[x1],#16
+Lenter_cbc_enc128:
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	subs	x2,x2,#16
+	aese	v0.16b,v2.16b
+	aesmc	v0.16b,v0.16b
+	csel	x8,xzr,x8,eq
+	aese	v0.16b,v3.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v18.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v19.16b
+	aesmc	v0.16b,v0.16b
+	ld1	{v16.16b},[x0],x8
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	eor	v16.16b,v16.16b,v5.16b
+	aese	v0.16b,v23.16b
+	eor	v6.16b,v0.16b,v7.16b
+	b.hs	Loop_cbc_enc128
+
+	st1	{v6.16b},[x1],#16
+	b	Lcbc_done
+.align	5
+Lcbc_dec:
+	ld1	{v18.16b},[x0],#16
+	subs	x2,x2,#32		// bias
+	add	w6,w5,#2
+	orr	v3.16b,v0.16b,v0.16b
+	orr	v1.16b,v0.16b,v0.16b
+	orr	v19.16b,v18.16b,v18.16b
+	b.lo	Lcbc_dec_tail
+
+	orr	v1.16b,v18.16b,v18.16b
+	ld1	{v18.16b},[x0],#16
+	orr	v2.16b,v0.16b,v0.16b
+	orr	v3.16b,v1.16b,v1.16b
+	orr	v19.16b,v18.16b,v18.16b
+
+Loop3x_cbc_dec:
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_cbc_dec
+
+	aesd	v0.16b,v16.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	eor	v4.16b,v6.16b,v7.16b
+	subs	x2,x2,#0x30
+	eor	v5.16b,v2.16b,v7.16b
+	csel	x6,x2,x6,lo			// x6/w6 is zero at this point
+	aesd	v0.16b,v17.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+					// at exit from the loop v1.16b-v18.16b
+					// are loaded with the last "words"
+	orr	v6.16b,v19.16b,v19.16b
+	mov	x7,x3
+	aesd	v0.16b,v20.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v2.16b},[x0],#16
+	aesd	v0.16b,v21.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	aesd	v0.16b,v22.16b
+	aesimc	v0.16b,v0.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v19.16b},[x0],#16
+	aesd	v0.16b,v23.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	add	w6,w5,#2
+	eor	v4.16b,v4.16b,v0.16b
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v18.16b,v18.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v4.16b},[x1],#16
+	orr	v0.16b,v2.16b,v2.16b
+	st1	{v5.16b},[x1],#16
+	orr	v1.16b,v3.16b,v3.16b
+	st1	{v18.16b},[x1],#16
+	orr	v18.16b,v19.16b,v19.16b
+	b.hs	Loop3x_cbc_dec
+
+	cmn	x2,#0x30
+	b.eq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lcbc_dec_tail
+
+	aesd	v1.16b,v16.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v16.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v17.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v17.16b
+	aesimc	v18.16b,v18.16b
+	aesd	v1.16b,v20.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v20.16b
+	aesimc	v18.16b,v18.16b
+	cmn	x2,#0x20
+	aesd	v1.16b,v21.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v21.16b
+	aesimc	v18.16b,v18.16b
+	eor	v5.16b,v6.16b,v7.16b
+	aesd	v1.16b,v22.16b
+	aesimc	v1.16b,v1.16b
+	aesd	v18.16b,v22.16b
+	aesimc	v18.16b,v18.16b
+	eor	v17.16b,v3.16b,v7.16b
+	aesd	v1.16b,v23.16b
+	aesd	v18.16b,v23.16b
+	b.eq	Lcbc_dec_one
+	eor	v5.16b,v5.16b,v1.16b
+	eor	v17.16b,v17.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+	st1	{v17.16b},[x1],#16
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	eor	v5.16b,v5.16b,v18.16b
+	orr	v6.16b,v19.16b,v19.16b
+	st1	{v5.16b},[x1],#16
+
+Lcbc_done:
+	st1	{v6.16b},[x4]
+Lcbc_abort:
+	ldr	x29,[sp],#16
+	ret
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+
+.align	5
+_aes_hw_ctr32_encrypt_blocks:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack it is not popped later.
+	AARCH64_VALID_CALL_TARGET
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	ldr	w5,[x3,#240]
+
+	ldr	w8, [x4, #12]
+	ld1	{v0.4s},[x4]
+
+	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
+	sub	w5,w5,#4
+	mov	x12,#16
+	cmp	x2,#2
+	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
+	sub	w5,w5,#2
+	ld1	{v20.4s,v21.4s},[x7],#32
+	ld1	{v22.4s,v23.4s},[x7],#32
+	ld1	{v7.4s},[x7]
+	add	x7,x3,#32
+	mov	w6,w5
+	csel	x12,xzr,x12,lo
+
+	// ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	// affected by silicon errata #1742098 [0] and #1655431 [1],
+	// respectively, where the second instruction of an aese/aesmc
+	// instruction pair may execute twice if an interrupt is taken right
+	// after the first instruction consumes an input register of which a
+	// single 32-bit lane has been updated the last time it was modified.
+	//
+	// This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips this bug.
+	// We write to v6.16b and copy to the final register as a workaround.
+	//
+	// [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	// [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __AARCH64EB__
+	rev	w8, w8
+#endif
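+	// (The last word of the counter block is kept big-endian in
+	// memory; rev converts it to host order on little-endian targets
+	// so it can be incremented with plain adds below.)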
+	add	w10, w8, #1
+	orr	v6.16b,v0.16b,v0.16b
+	rev	w10, w10
+	mov	v6.s[3],w10
+	add	w8, w8, #2
+	orr	v1.16b,v6.16b,v6.16b
+	b.ls	Lctr32_tail
+	rev	w12, w8
+	mov	v6.s[3],w12
+	sub	x2,x2,#3		// bias
+	orr	v18.16b,v6.16b,v6.16b
+	b	Loop3x_ctr32
+
+.align	4
+Loop3x_ctr32:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	aese	v18.16b,v17.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Loop3x_ctr32
+
+	aese	v0.16b,v16.16b
+	aesmc	v4.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v5.16b,v1.16b
+	ld1	{v2.16b},[x0],#16
+	add	w9,w8,#1
+	aese	v18.16b,v16.16b
+	aesmc	v18.16b,v18.16b
+	ld1	{v3.16b},[x0],#16
+	rev	w9,w9
+	aese	v4.16b,v17.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v17.16b
+	aesmc	v5.16b,v5.16b
+	ld1	{v19.16b},[x0],#16
+	mov	x7,x3
+	aese	v18.16b,v17.16b
+	aesmc	v17.16b,v18.16b
+	aese	v4.16b,v20.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v20.16b
+	aesmc	v5.16b,v5.16b
+	eor	v2.16b,v2.16b,v7.16b
+	add	w10,w8,#2
+	aese	v17.16b,v20.16b
+	aesmc	v17.16b,v17.16b
+	eor	v3.16b,v3.16b,v7.16b
+	add	w8,w8,#3
+	aese	v4.16b,v21.16b
+	aesmc	v4.16b,v4.16b
+	aese	v5.16b,v21.16b
+	aesmc	v5.16b,v5.16b
+	 // Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+	 // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 // 32-bit mode. See the comment above.
+	eor	v19.16b,v19.16b,v7.16b
+	mov	v6.s[3], w9
+	aese	v17.16b,v21.16b
+	aesmc	v17.16b,v17.16b
+	orr	v0.16b,v6.16b,v6.16b
+	rev	w10,w10
+	aese	v4.16b,v22.16b
+	aesmc	v4.16b,v4.16b
+	mov	v6.s[3], w10
+	rev	w12,w8
+	aese	v5.16b,v22.16b
+	aesmc	v5.16b,v5.16b
+	orr	v1.16b,v6.16b,v6.16b
+	mov	v6.s[3], w12
+	aese	v17.16b,v22.16b
+	aesmc	v17.16b,v17.16b
+	orr	v18.16b,v6.16b,v6.16b
+	subs	x2,x2,#3
+	aese	v4.16b,v23.16b
+	aese	v5.16b,v23.16b
+	aese	v17.16b,v23.16b
+
+	eor	v2.16b,v2.16b,v4.16b
+	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
+	st1	{v2.16b},[x1],#16
+	eor	v3.16b,v3.16b,v5.16b
+	mov	w6,w5
+	st1	{v3.16b},[x1],#16
+	eor	v19.16b,v19.16b,v17.16b
+	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
+	st1	{v19.16b},[x1],#16
+	b.hs	Loop3x_ctr32
+
+	adds	x2,x2,#3
+	b.eq	Lctr32_done
+	cmp	x2,#1
+	mov	x12,#16
+	csel	x12,xzr,x12,eq
+
+Lctr32_tail:
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v16.4s},[x7],#16
+	subs	w6,w6,#2
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v17.4s},[x7],#16
+	b.gt	Lctr32_tail
+
+	aese	v0.16b,v16.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v16.16b
+	aesmc	v1.16b,v1.16b
+	aese	v0.16b,v17.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v17.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v2.16b},[x0],x12
+	aese	v0.16b,v20.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v20.16b
+	aesmc	v1.16b,v1.16b
+	ld1	{v3.16b},[x0]
+	aese	v0.16b,v21.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v21.16b
+	aesmc	v1.16b,v1.16b
+	eor	v2.16b,v2.16b,v7.16b
+	aese	v0.16b,v22.16b
+	aesmc	v0.16b,v0.16b
+	aese	v1.16b,v22.16b
+	aesmc	v1.16b,v1.16b
+	eor	v3.16b,v3.16b,v7.16b
+	aese	v0.16b,v23.16b
+	aese	v1.16b,v23.16b
+
+	cmp	x2,#1
+	eor	v2.16b,v2.16b,v0.16b
+	eor	v3.16b,v3.16b,v1.16b
+	st1	{v2.16b},[x1],#16
+	b.eq	Lctr32_done
+	st1	{v3.16b},[x1]
+
+Lctr32_done:
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/armv8-mont.S b/apple-aarch64/crypto/fipsmodule/armv8-mont.S
new file mode 100644
index 0000000..2493ae0
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -0,0 +1,1433 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+
+.align	5
+_bn_mul_mont:
+	AARCH64_SIGN_LINK_REGISTER
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
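+	// (Dispatch: if num is divisible by 8, take the sqr8x path, which
+	// itself falls back to mul4x when the two inputs differ; if num is
+	// divisible by 4, take the mul4x path; otherwise use the generic
+	// loop below.)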
+Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16	// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	As for the removal of the first multiplication and
+	//	addition instructions: the outcome of the first addition
+	//	is guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	So when does it carry? If you follow the operations, you
+	//	can observe that the condition for carry is quite simple:
+	//	x6 being non-zero. The carry can therefore be calculated
+	//	by adding -1 to x6, which is what the next instruction
+	//	does.
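+	//	(Concretely, subs xzr,x6,#1 sets the carry flag exactly
+	//	when x6 is non-zero, and the adc below folds that carry
+	//	into x13.)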
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	adc	x13,x13,xzr
+	cbz	x21,L1st_skip
+
+L1st:
+	ldr	x8,[x1],#8
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	ldr	x14,[x3],#8
+	adds	x12,x16,x13
+	mul	x10,x8,x9		// ap[j]*bp[0]
+	adc	x13,x17,xzr
+	umulh	x11,x8,x9
+
+	adds	x12,x12,x6
+	mul	x16,x14,x15		// np[j]*m1
+	adc	x13,x13,xzr
+	umulh	x17,x14,x15
+	str	x12,[x22],#8		// tp[j-1]
+	cbnz	x21,L1st
+
+L1st_skip:
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adc	x13,x17,xzr
+
+	adds	x12,x12,x6
+	sub	x20,x5,#8		// i=num-1
+	adcs	x13,x13,x7
+
+	adc	x19,xzr,xzr		// upmost overflow bit
+	stp	x12,x13,[x22]
+
+Louter:
+	ldr	x9,[x2],#8		// bp[i]
+	ldp	x7,x8,[x1],#16
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+
+	mul	x6,x7,x9		// ap[0]*bp[i]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	ldp	x13,x14,[x3],#16
+	mul	x10,x8,x9		// ap[1]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x15,x6,x4
+	sub	x20,x20,#8		// i--
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6
+	subs	xzr,x6,#1		// (*)
+	umulh	x17,x14,x15
+	cbz	x21,Linner_skip
+
+Linner:
+	ldr	x8,[x1],#8
+	adc	x13,x13,xzr
+	ldr	x23,[x22],#8		// tp[j]
+	adds	x6,x10,x7
+	sub	x21,x21,#8		// j--
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	ldr	x14,[x3],#8
+	adc	x13,x17,xzr
+
+	mul	x10,x8,x9		// ap[j]*bp[i]
+	adds	x6,x6,x23
+	umulh	x11,x8,x9
+	adc	x7,x7,xzr
+
+	mul	x16,x14,x15		// np[j]*m1
+	adds	x12,x12,x6
+	umulh	x17,x14,x15
+	str	x12,[x22,#-16]		// tp[j-1]
+	cbnz	x21,Linner
+
+Linner_skip:
+	ldr	x23,[x22],#8		// tp[j]
+	adc	x13,x13,xzr
+	adds	x6,x10,x7
+	sub	x1,x1,x5		// rewind x1
+	adc	x7,x11,xzr
+
+	adds	x12,x16,x13
+	sub	x3,x3,x5		// rewind x3
+	adcs	x13,x17,x19
+	adc	x19,xzr,xzr
+
+	adds	x6,x6,x23
+	adc	x7,x7,xzr
+
+	adds	x12,x12,x6
+	adcs	x13,x13,x7
+	adc	x19,x19,xzr		// upmost overflow bit
+	stp	x12,x13,[x22,#-16]
+
+	cbnz	x20,Louter
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
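+	// (Lsub below stores tp[]-np[] to rp[]; Lcond_copy then selects,
+	// via csel on the borrow flag, the original tp[j] whenever the
+	// subtraction underflowed, wiping tp[] as it goes.)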
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x14,[x3],#8		// np[0]
+	subs	x21,x5,#8		// j=num-1 and clear borrow
+	mov	x1,x0
+Lsub:
+	sbcs	x8,x23,x14		// tp[j]-np[j]
+	ldr	x23,[x22],#8
+	sub	x21,x21,#8		// j--
+	ldr	x14,[x3],#8
+	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
+	cbnz	x21,Lsub
+
+	sbcs	x8,x23,x14
+	sbcs	x19,x19,xzr		// did it borrow?
+	str	x8,[x1],#8		// rp[num-1]
+
+	ldr	x23,[sp]		// tp[0]
+	add	x22,sp,#8
+	ldr	x8,[x0],#8		// rp[0]
+	sub	x5,x5,#8		// num--
+	nop
+Lcond_copy:
+	sub	x5,x5,#8		// num--
+	csel	x14,x23,x8,lo		// did it borrow?
+	ldr	x23,[x22],#8
+	ldr	x8,[x0],#8
+	str	xzr,[x22,#-16]		// wipe tp
+	str	x14,[x0,#-16]
+	cbnz	x5,Lcond_copy
+
+	csel	x14,x23,x8,lo
+	str	xzr,[x22,#-8]		// wipe tp
+	str	x14,[x0,#-8]
+
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldr	x29,[sp],#64
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.align	5
+__bn_sqr8x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+	// only from bn_mul_mont which has already signed the return address.
+	cmp	x1,x2
+	b.ne	__bn_mul4x_mont
+Lsqr8x_mont:
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	stp	x0,x3,[sp,#96]	// offload rp and np
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	ldp	x12,x13,[x1,#8*6]
+
+	sub	x2,sp,x5,lsl#4
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	mov	sp,x2			// alloca
+	sub	x27,x5,#8*8
+	b	Lsqr8x_zero_start
+
+Lsqr8x_zero:
+	sub	x27,x27,#8*8
+	stp	xzr,xzr,[x2,#8*0]
+	stp	xzr,xzr,[x2,#8*2]
+	stp	xzr,xzr,[x2,#8*4]
+	stp	xzr,xzr,[x2,#8*6]
+Lsqr8x_zero_start:
+	stp	xzr,xzr,[x2,#8*8]
+	stp	xzr,xzr,[x2,#8*10]
+	stp	xzr,xzr,[x2,#8*12]
+	stp	xzr,xzr,[x2,#8*14]
+	add	x2,x2,#8*16
+	cbnz	x27,Lsqr8x_zero
+
+	add	x3,x1,x5
+	add	x1,x1,#8*8
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	mov	x23,xzr
+	mov	x24,xzr
+	mov	x25,xzr
+	mov	x26,xzr
+	mov	x2,sp
+	str	x4,[x29,#112]		// offload n0
+
+	// Multiply everything but a[i]*a[i]
+.align	4
+Lsqr8x_outer_loop:
+        //                                                 a[1]a[0]	(i)
+        //                                             a[2]a[0]
+        //                                         a[3]a[0]
+        //                                     a[4]a[0]
+        //                                 a[5]a[0]
+        //                             a[6]a[0]
+        //                         a[7]a[0]
+        //                                         a[2]a[1]		(ii)
+        //                                     a[3]a[1]
+        //                                 a[4]a[1]
+        //                             a[5]a[1]
+        //                         a[6]a[1]
+        //                     a[7]a[1]
+        //                                 a[3]a[2]			(iii)
+        //                             a[4]a[2]
+        //                         a[5]a[2]
+        //                     a[6]a[2]
+        //                 a[7]a[2]
+        //                         a[4]a[3]				(iv)
+        //                     a[5]a[3]
+        //                 a[6]a[3]
+        //             a[7]a[3]
+        //                 a[5]a[4]					(v)
+        //             a[6]a[4]
+        //         a[7]a[4]
+        //         a[6]a[5]						(vi)
+        //     a[7]a[5]
+        // a[7]a[6]							(vii)
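+        // Each row (i)..(vii) above is accumulated by the code below;
+        // the diagonal a[i]*a[i] terms are added in later, after
+        // doubling, starting at Lsqr8x_outer_break.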
+
+	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
+	mul	x15,x8,x6
+	mul	x16,x9,x6
+	mul	x17,x10,x6
+	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
+	mul	x14,x11,x6
+	adcs	x21,x21,x15
+	mul	x15,x12,x6
+	adcs	x22,x22,x16
+	mul	x16,x13,x6
+	adcs	x23,x23,x17
+	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
+	adcs	x24,x24,x14
+	umulh	x14,x8,x6
+	adcs	x25,x25,x15
+	umulh	x15,x9,x6
+	adcs	x26,x26,x16
+	umulh	x16,x10,x6
+	stp	x19,x20,[x2],#8*2	// t[0..1]
+	adc	x19,xzr,xzr		// t[8]
+	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
+	umulh	x17,x11,x6
+	adcs	x22,x22,x14
+	umulh	x14,x12,x6
+	adcs	x23,x23,x15
+	umulh	x15,x13,x6
+	adcs	x24,x24,x16
+	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
+	adcs	x25,x25,x17
+	mul	x17,x9,x7
+	adcs	x26,x26,x14
+	mul	x14,x10,x7
+	adc	x19,x19,x15
+
+	mul	x15,x11,x7
+	adds	x22,x22,x16
+	mul	x16,x12,x7
+	adcs	x23,x23,x17
+	mul	x17,x13,x7
+	adcs	x24,x24,x14
+	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
+	adcs	x25,x25,x15
+	umulh	x15,x9,x7
+	adcs	x26,x26,x16
+	umulh	x16,x10,x7
+	adcs	x19,x19,x17
+	umulh	x17,x11,x7
+	stp	x21,x22,[x2],#8*2	// t[2..3]
+	adc	x20,xzr,xzr		// t[9]
+	adds	x23,x23,x14
+	umulh	x14,x12,x7
+	adcs	x24,x24,x15
+	umulh	x15,x13,x7
+	adcs	x25,x25,x16
+	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
+	adcs	x26,x26,x17
+	mul	x17,x10,x8
+	adcs	x19,x19,x14
+	mul	x14,x11,x8
+	adc	x20,x20,x15
+
+	mul	x15,x12,x8
+	adds	x24,x24,x16
+	mul	x16,x13,x8
+	adcs	x25,x25,x17
+	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
+	adcs	x26,x26,x14
+	umulh	x14,x10,x8
+	adcs	x19,x19,x15
+	umulh	x15,x11,x8
+	adcs	x20,x20,x16
+	umulh	x16,x12,x8
+	stp	x23,x24,[x2],#8*2	// t[4..5]
+	adc	x21,xzr,xzr		// t[10]
+	adds	x25,x25,x17
+	umulh	x17,x13,x8
+	adcs	x26,x26,x14
+	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
+	adcs	x19,x19,x15
+	mul	x15,x11,x9
+	adcs	x20,x20,x16
+	mul	x16,x12,x9
+	adc	x21,x21,x17
+
+	mul	x17,x13,x9
+	adds	x26,x26,x14
+	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
+	adcs	x19,x19,x15
+	umulh	x15,x11,x9
+	adcs	x20,x20,x16
+	umulh	x16,x12,x9
+	adcs	x21,x21,x17
+	umulh	x17,x13,x9
+	stp	x25,x26,[x2],#8*2	// t[6..7]
+	adc	x22,xzr,xzr		// t[11]
+	adds	x19,x19,x14
+	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
+	adcs	x20,x20,x15
+	mul	x15,x12,x10
+	adcs	x21,x21,x16
+	mul	x16,x13,x10
+	adc	x22,x22,x17
+
+	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
+	adds	x20,x20,x14
+	umulh	x14,x12,x10
+	adcs	x21,x21,x15
+	umulh	x15,x13,x10
+	adcs	x22,x22,x16
+	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
+	adc	x23,xzr,xzr		// t[12]
+	adds	x21,x21,x17
+	mul	x17,x13,x11
+	adcs	x22,x22,x14
+	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
+	adc	x23,x23,x15
+
+	umulh	x15,x13,x11
+	adds	x22,x22,x16
+	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
+	adcs	x23,x23,x17
+	umulh	x17,x13,x12		// hi(a[7]*a[6])
+	adc	x24,xzr,xzr		// t[13]
+	adds	x23,x23,x14
+	sub	x27,x3,x1	// done yet?
+	adc	x24,x24,x15
+
+	adds	x24,x24,x16
+	sub	x14,x3,x5	// rewound ap
+	adc	x25,xzr,xzr		// t[14]
+	add	x25,x25,x17
+
+	cbz	x27,Lsqr8x_outer_break
+
+	mov	x4,x6
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x0,x1
+	adcs	x26,xzr,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved below
+	mov	x27,#-8*8
+
+	//                                                         a[8]a[0]
+	//                                                     a[9]a[0]
+	//                                                 a[a]a[0]
+	//                                             a[b]a[0]
+	//                                         a[c]a[0]
+	//                                     a[d]a[0]
+	//                                 a[e]a[0]
+	//                             a[f]a[0]
+	//                                                     a[8]a[1]
+	//                         a[f]a[1]........................
+	//                                                 a[8]a[2]
+	//                     a[f]a[2]........................
+	//                                             a[8]a[3]
+	//                 a[f]a[3]........................
+	//                                         a[8]a[4]
+	//             a[f]a[4]........................
+	//                                     a[8]a[5]
+	//         a[f]a[5]........................
+	//                                 a[8]a[6]
+	//     a[f]a[6]........................
+	//                             a[8]a[7]
+	// a[f]a[7]........................
+Lsqr8x_mul:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_mul
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	cmp	x1,x3		// done yet?
+	b.eq	Lsqr8x_break
+
+	ldp	x6,x7,[x2,#8*0]
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	adds	x19,x19,x6
+	ldr	x4,[x0,#-8*8]
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_mul
+
+.align	4
+Lsqr8x_break:
+	ldp	x6,x7,[x0,#8*0]
+	add	x1,x0,#8*8
+	ldp	x8,x9,[x0,#8*2]
+	sub	x14,x3,x1		// is it last iteration?
+	ldp	x10,x11,[x0,#8*4]
+	sub	x15,x2,x14
+	ldp	x12,x13,[x0,#8*6]
+	cbz	x14,Lsqr8x_outer_loop
+
+	stp	x19,x20,[x2,#8*0]
+	ldp	x19,x20,[x15,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x15,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x15,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x15
+	ldp	x25,x26,[x15,#8*6]
+	b	Lsqr8x_outer_loop
+
+.align	4
+Lsqr8x_outer_break:
+	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
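+	// (The doubling happens on the fly: the lsl#1 and the extr
+	// instructions below shift the accumulated cross-product sum left
+	// by one bit across 64-bit word boundaries.)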
+	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
+	ldp	x15,x16,[sp,#8*1]
+	ldp	x11,x13,[x14,#8*2]
+	add	x1,x14,#8*4
+	ldp	x17,x14,[sp,#8*3]
+
+	stp	x19,x20,[x2,#8*0]
+	mul	x19,x7,x7
+	stp	x21,x22,[x2,#8*2]
+	umulh	x7,x7,x7
+	stp	x23,x24,[x2,#8*4]
+	mul	x8,x9,x9
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,sp
+	umulh	x9,x9,x9
+	adds	x20,x7,x15,lsl#1
+	extr	x15,x16,x15,#63
+	sub	x27,x5,#8*4
+
+Lsqr4x_shift_n_add:
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	sub	x27,x27,#8*4
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	ldp	x7,x9,[x1],#8*2
+	umulh	x11,x11,x11
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	extr	x17,x14,x17,#63
+	stp	x19,x20,[x2,#8*0]
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	stp	x21,x22,[x2,#8*2]
+	adcs	x24,x11,x14
+	ldp	x17,x14,[x2,#8*7]
+	extr	x15,x16,x15,#63
+	adcs	x25,x12,x15
+	extr	x16,x17,x16,#63
+	adcs	x26,x13,x16
+	ldp	x15,x16,[x2,#8*9]
+	mul	x6,x7,x7
+	ldp	x11,x13,[x1],#8*2
+	umulh	x7,x7,x7
+	mul	x8,x9,x9
+	umulh	x9,x9,x9
+	stp	x23,x24,[x2,#8*4]
+	extr	x17,x14,x17,#63
+	stp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	adcs	x19,x6,x17
+	extr	x14,x15,x14,#63
+	adcs	x20,x7,x14
+	ldp	x17,x14,[x2,#8*3]
+	extr	x15,x16,x15,#63
+	cbnz	x27,Lsqr4x_shift_n_add
+	ldp	x1,x4,[x29,#104]	// pull np and n0
+
+	adcs	x21,x8,x15
+	extr	x16,x17,x16,#63
+	adcs	x22,x9,x16
+	ldp	x15,x16,[x2,#8*5]
+	mul	x10,x11,x11
+	umulh	x11,x11,x11
+	stp	x19,x20,[x2,#8*0]
+	mul	x12,x13,x13
+	umulh	x13,x13,x13
+	stp	x21,x22,[x2,#8*2]
+	extr	x17,x14,x17,#63
+	adcs	x23,x10,x17
+	extr	x14,x15,x14,#63
+	ldp	x19,x20,[sp,#8*0]
+	adcs	x24,x11,x14
+	extr	x15,x16,x15,#63
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x25,x12,x15
+	extr	x16,xzr,x16,#63
+	ldp	x8,x9,[x1,#8*2]
+	adc	x26,x13,x16
+	ldp	x10,x11,[x1,#8*4]
+
+	// Reduce by 512 bits per iteration
+	mul	x28,x4,x19		// t[0]*n0
+	ldp	x12,x13,[x1,#8*6]
+	add	x3,x1,x5
+	ldp	x21,x22,[sp,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[sp,#8*4]
+	stp	x25,x26,[x2,#8*6]
+	ldp	x25,x26,[sp,#8*6]
+	add	x1,x1,#8*8
+	mov	x30,xzr		// initial top-most carry
+	mov	x2,sp
+	mov	x27,#8
+
+Lsqr8x_reduction:
+	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
+	mul	x15,x7,x28
+	sub	x27,x27,#1
+	mul	x16,x8,x28
+	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
+	mul	x17,x9,x28
+	// (*)	adds	xzr,x19,x14
+	subs	xzr,x19,#1		// (*)
+	mul	x14,x10,x28
+	adcs	x19,x20,x15
+	mul	x15,x11,x28
+	adcs	x20,x21,x16
+	mul	x16,x12,x28
+	adcs	x21,x22,x17
+	mul	x17,x13,x28
+	adcs	x22,x23,x14
+	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
+	adcs	x23,x24,x15
+	umulh	x15,x7,x28
+	adcs	x24,x25,x16
+	umulh	x16,x8,x28
+	adcs	x25,x26,x17
+	umulh	x17,x9,x28
+	adc	x26,xzr,xzr
+	adds	x19,x19,x14
+	umulh	x14,x10,x28
+	adcs	x20,x20,x15
+	umulh	x15,x11,x28
+	adcs	x21,x21,x16
+	umulh	x16,x12,x28
+	adcs	x22,x22,x17
+	umulh	x17,x13,x28
+	mul	x28,x4,x19		// next t[0]*n0
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adc	x26,x26,x17
+	cbnz	x27,Lsqr8x_reduction
+
+	ldp	x14,x15,[x2,#8*0]
+	ldp	x16,x17,[x2,#8*2]
+	mov	x0,x2
+	sub	x27,x3,x1	// done yet?
+	adds	x19,x19,x14
+	adcs	x20,x20,x15
+	ldp	x14,x15,[x2,#8*4]
+	adcs	x21,x21,x16
+	adcs	x22,x22,x17
+	ldp	x16,x17,[x2,#8*6]
+	adcs	x23,x23,x14
+	adcs	x24,x24,x15
+	adcs	x25,x25,x16
+	adcs	x26,x26,x17
+	//adc	x28,xzr,xzr		// moved below
+	cbz	x27,Lsqr8x8_post_condition
+
+	ldr	x4,[x2,#-8*8]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	ldp	x10,x11,[x1,#8*4]
+	mov	x27,#-8*8
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+
+Lsqr8x_tail:
+	mul	x14,x6,x4
+	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
+	mul	x15,x7,x4
+	add	x27,x27,#8
+	mul	x16,x8,x4
+	mul	x17,x9,x4
+	adds	x19,x19,x14
+	mul	x14,x10,x4
+	adcs	x20,x20,x15
+	mul	x15,x11,x4
+	adcs	x21,x21,x16
+	mul	x16,x12,x4
+	adcs	x22,x22,x17
+	mul	x17,x13,x4
+	adcs	x23,x23,x14
+	umulh	x14,x6,x4
+	adcs	x24,x24,x15
+	umulh	x15,x7,x4
+	adcs	x25,x25,x16
+	umulh	x16,x8,x4
+	adcs	x26,x26,x17
+	umulh	x17,x9,x4
+	adc	x28,x28,xzr
+	str	x19,[x2],#8
+	adds	x19,x20,x14
+	umulh	x14,x10,x4
+	adcs	x20,x21,x15
+	umulh	x15,x11,x4
+	adcs	x21,x22,x16
+	umulh	x16,x12,x4
+	adcs	x22,x23,x17
+	umulh	x17,x13,x4
+	ldr	x4,[x0,x27]
+	adcs	x23,x24,x14
+	adcs	x24,x25,x15
+	adcs	x25,x26,x16
+	adcs	x26,x28,x17
+	//adc	x28,xzr,xzr		// moved above
+	cbnz	x27,Lsqr8x_tail
+					// note that carry flag is guaranteed
+					// to be zero at this point
+	ldp	x6,x7,[x2,#8*0]
+	sub	x27,x3,x1	// done yet?
+	sub	x16,x3,x5	// rewound np
+	ldp	x8,x9,[x2,#8*2]
+	ldp	x10,x11,[x2,#8*4]
+	ldp	x12,x13,[x2,#8*6]
+	cbz	x27,Lsqr8x_tail_break
+
+	ldr	x4,[x0,#-8*8]
+	adds	x19,x19,x6
+	adcs	x20,x20,x7
+	ldp	x6,x7,[x1,#8*0]
+	adcs	x21,x21,x8
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x1,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x1,#8*4]
+	adcs	x25,x25,x12
+	mov	x27,#-8*8
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	//adc	x28,xzr,xzr		// moved above
+	b	Lsqr8x_tail
+
+.align	4
+Lsqr8x_tail_break:
+	ldr	x4,[x29,#112]		// pull n0
+	add	x27,x2,#8*8		// end of current t[num] window
+
+	subs	xzr,x30,#1		// "move" top-most carry to carry bit
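+	// (x30 is 0 or 1 here, so x30-1 sets the carry flag exactly when
+	// the saved top-most carry was 1.)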
+	adcs	x14,x19,x6
+	adcs	x15,x20,x7
+	ldp	x19,x20,[x0,#8*0]
+	adcs	x21,x21,x8
+	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
+	adcs	x22,x22,x9
+	ldp	x8,x9,[x16,#8*2]
+	adcs	x23,x23,x10
+	adcs	x24,x24,x11
+	ldp	x10,x11,[x16,#8*4]
+	adcs	x25,x25,x12
+	adcs	x26,x26,x13
+	ldp	x12,x13,[x16,#8*6]
+	add	x1,x16,#8*8
+	adc	x30,xzr,xzr	// top-most carry
+	mul	x28,x4,x19
+	stp	x14,x15,[x2,#8*0]
+	stp	x21,x22,[x2,#8*2]
+	ldp	x21,x22,[x0,#8*2]
+	stp	x23,x24,[x2,#8*4]
+	ldp	x23,x24,[x0,#8*4]
+	cmp	x27,x29		// did we hit the bottom?
+	stp	x25,x26,[x2,#8*6]
+	mov	x2,x0			// slide the window
+	ldp	x25,x26,[x0,#8*6]
+	mov	x27,#8
+	b.ne	Lsqr8x_reduction
+
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	ldr	x0,[x29,#96]		// pull rp
+	add	x2,x2,#8*8
+	subs	x14,x19,x6
+	sbcs	x15,x20,x7
+	sub	x27,x5,#8*8
+	mov	x3,x0		// x0 copy
+
+Lsqr8x_sub:
+	sbcs	x16,x21,x8
+	ldp	x6,x7,[x1,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x1,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x10,x11,[x1,#8*4]
+	sbcs	x17,x26,x13
+	ldp	x12,x13,[x1,#8*6]
+	add	x1,x1,#8*8
+	ldp	x19,x20,[x2,#8*0]
+	sub	x27,x27,#8*8
+	ldp	x21,x22,[x2,#8*2]
+	ldp	x23,x24,[x2,#8*4]
+	ldp	x25,x26,[x2,#8*6]
+	add	x2,x2,#8*8
+	stp	x14,x15,[x0,#8*4]
+	sbcs	x14,x19,x6
+	stp	x16,x17,[x0,#8*6]
+	add	x0,x0,#8*8
+	sbcs	x15,x20,x7
+	cbnz	x27,Lsqr8x_sub
+
+	sbcs	x16,x21,x8
+	mov	x2,sp
+	add	x1,sp,x5
+	ldp	x6,x7,[x3,#8*0]
+	sbcs	x17,x22,x9
+	stp	x14,x15,[x0,#8*0]
+	sbcs	x14,x23,x10
+	ldp	x8,x9,[x3,#8*2]
+	sbcs	x15,x24,x11
+	stp	x16,x17,[x0,#8*2]
+	sbcs	x16,x25,x12
+	ldp	x19,x20,[x1,#8*0]
+	sbcs	x17,x26,x13
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+	stp	x14,x15,[x0,#8*4]
+	stp	x16,x17,[x0,#8*6]
+
+	sub	x27,x5,#8*4
+Lsqr4x_cond_copy:
+	sub	x27,x27,#8*4
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	ldp	x6,x7,[x3,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x16,x21,x8,lo
+	stp	xzr,xzr,[x2,#8*2]
+	add	x2,x2,#8*4
+	csel	x17,x22,x9,lo
+	ldp	x8,x9,[x3,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	stp	xzr,xzr,[x1,#8*0]
+	stp	xzr,xzr,[x1,#8*2]
+	cbnz	x27,Lsqr4x_cond_copy
+
+	csel	x14,x19,x6,lo
+	stp	xzr,xzr,[x2,#8*0]
+	csel	x15,x20,x7,lo
+	stp	xzr,xzr,[x2,#8*2]
+	csel	x16,x21,x8,lo
+	csel	x17,x22,x9,lo
+	stp	x14,x15,[x3,#8*0]
+	stp	x16,x17,[x3,#8*2]
+
+	b	Lsqr8x_done
+
+.align	4
+Lsqr8x8_post_condition:
+	adc	x28,xzr,xzr
+	ldr	x30,[x29,#8]		// pull return address
+	// x19-x26,x28 hold the result, x6-x13 hold the modulus
+	subs	x6,x19,x6
+	ldr	x1,[x29,#96]		// pull rp
+	sbcs	x7,x20,x7
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x8
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x9
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	x10,x23,x10
+	stp	xzr,xzr,[sp,#8*6]
+	sbcs	x11,x24,x11
+	stp	xzr,xzr,[sp,#8*8]
+	sbcs	x12,x25,x12
+	stp	xzr,xzr,[sp,#8*10]
+	sbcs	x13,x26,x13
+	stp	xzr,xzr,[sp,#8*12]
+	sbcs	x28,x28,xzr	// did it borrow?
+	stp	xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	csel	x10,x23,x10,lo
+	csel	x11,x24,x11,lo
+	stp	x8,x9,[x1,#8*2]
+	csel	x12,x25,x12,lo
+	csel	x13,x26,x13,lo
+	stp	x10,x11,[x1,#8*4]
+	stp	x12,x13,[x1,#8*6]
+
+Lsqr8x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.align	5
+__bn_mul4x_mont:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+	// return address.
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	sub	x26,sp,x5,lsl#3
+	lsl	x5,x5,#3
+	ldr	x4,[x4]		// *n0
+	sub	sp,x26,#8*4		// alloca
+
+	add	x10,x2,x5
+	add	x27,x1,x5
+	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
+
+	ldr	x24,[x2,#8*0]		// b[0]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	mov	x19,xzr
+	mov	x20,xzr
+	mov	x21,xzr
+	mov	x22,xzr
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x28,#0
+	mov	x26,sp
+
+Loop_mul4x_1st_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[0])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	sub	x10,x27,x1
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_reduction
+
+	cbz	x10,Lmul4x4_post_condition
+
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldr	x25,[sp]		// a[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+Loop_mul4x_1st_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[i])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i] (or b[0])
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	adcs	x23,x23,x0
+	umulh	x13,x17,x25
+	adc	x0,xzr,xzr
+	ldr	x25,[sp,x28]		// next t[0]*n0
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_1st_tail
+
+	sub	x11,x27,x5	// rewound x1
+	cbz	x10,Lmul4x_proceed
+
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_1st_tail
+
+.align	5
+Lmul4x_proceed:
+	ldr	x24,[x2,#8*4]!		// *++b
+	adc	x30,x0,xzr
+	ldp	x6,x7,[x11,#8*0]	// a[0..3]
+	sub	x3,x3,x5		// rewind np
+	ldp	x8,x9,[x11,#8*2]
+	add	x1,x11,#8*4
+
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	ldp	x21,x22,[sp,#8*6]
+
+	ldp	x14,x15,[x3,#8*0]	// n[0..3]
+	mov	x26,sp
+	ldp	x16,x17,[x3,#8*2]
+	adds	x3,x3,#8*4		// clear carry bit
+	mov	x0,xzr
+
+.align	4
+Loop_mul4x_reduction:
+	mul	x10,x6,x24		// lo(a[0..3]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
+	adcs	x20,x20,x11
+	mul	x25,x19,x4		// t[0]*n0
+	adcs	x21,x21,x12
+	umulh	x11,x7,x24
+	adcs	x22,x22,x13
+	umulh	x12,x8,x24
+	adc	x23,xzr,xzr
+	umulh	x13,x9,x24
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	// (*)	mul	x10,x14,x25
+	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
+	adcs	x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	// (*)	adds	xzr,x19,x10
+	subs	xzr,x19,#1		// (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+	adcs	x19,x20,x11
+	umulh	x11,x15,x25
+	adcs	x20,x21,x12
+	umulh	x12,x16,x25
+	adcs	x21,x22,x13
+	umulh	x13,x17,x25
+	adcs	x22,x23,x0
+	adc	x0,xzr,xzr
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_reduction
+
+	adc	x0,x0,xzr
+	ldp	x10,x11,[x26,#8*4]	// t[4..7]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]	// a[4..7]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+
+	ldr	x25,[sp]		// t[0]*n0
+	ldp	x14,x15,[x3,#8*0]	// n[4..7]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+
+.align	4
+Loop_mul4x_tail:
+	mul	x10,x6,x24		// lo(a[4..7]*b[4])
+	adc	x0,x0,xzr	// modulo-scheduled
+	mul	x11,x7,x24
+	add	x28,x28,#8
+	mul	x12,x8,x24
+	and	x28,x28,#31
+	mul	x13,x9,x24
+	adds	x19,x19,x10
+	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
+	adcs	x20,x20,x11
+	umulh	x11,x7,x24
+	adcs	x21,x21,x12
+	umulh	x12,x8,x24
+	adcs	x22,x22,x13
+	umulh	x13,x9,x24
+	adc	x23,xzr,xzr
+	ldr	x24,[x2,x28]		// next b[i]
+	adds	x20,x20,x10
+	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
+	adcs	x21,x21,x11
+	mul	x11,x15,x25
+	adcs	x22,x22,x12
+	mul	x12,x16,x25
+	adc	x23,x23,x13		// can't overflow
+	mul	x13,x17,x25
+	adds	x19,x19,x10
+	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
+	adcs	x20,x20,x11
+	umulh	x11,x15,x25
+	adcs	x21,x21,x12
+	umulh	x12,x16,x25
+	adcs	x22,x22,x13
+	umulh	x13,x17,x25
+	adcs	x23,x23,x0
+	ldr	x25,[sp,x28]		// next a[0]*n0
+	adc	x0,xzr,xzr
+	str	x19,[x26],#8		// result!!!
+	adds	x19,x20,x10
+	sub	x10,x27,x1		// done yet?
+	adcs	x20,x21,x11
+	adcs	x21,x22,x12
+	adcs	x22,x23,x13
+	//adc	x0,x0,xzr
+	cbnz	x28,Loop_mul4x_tail
+
+	sub	x11,x3,x5		// rewound np
+	adc	x0,x0,xzr
+	cbz	x10,Loop_mul4x_break
+
+	ldp	x10,x11,[x26,#8*4]
+	ldp	x12,x13,[x26,#8*6]
+	ldp	x6,x7,[x1,#8*0]
+	ldp	x8,x9,[x1,#8*2]
+	add	x1,x1,#8*4
+	adds	x19,x19,x10
+	adcs	x20,x20,x11
+	adcs	x21,x21,x12
+	adcs	x22,x22,x13
+	//adc	x0,x0,xzr
+	ldp	x14,x15,[x3,#8*0]
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	b	Loop_mul4x_tail
+
+.align	4
+Loop_mul4x_break:
+	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
+	adds	x19,x19,x30
+	add	x2,x2,#8*4		// bp++
+	adcs	x20,x20,xzr
+	sub	x1,x1,x5		// rewind ap
+	adcs	x21,x21,xzr
+	stp	x19,x20,[x26,#8*0]	// result!!!
+	adcs	x22,x22,xzr
+	ldp	x19,x20,[sp,#8*4]	// t[0..3]
+	adc	x30,x0,xzr
+	stp	x21,x22,[x26,#8*2]	// result!!!
+	cmp	x2,x13			// done yet?
+	ldp	x21,x22,[sp,#8*6]
+	ldp	x14,x15,[x11,#8*0]	// n[0..3]
+	ldp	x16,x17,[x11,#8*2]
+	add	x3,x11,#8*4
+	b.eq	Lmul4x_post
+
+	ldr	x24,[x2]
+	ldp	x6,x7,[x1,#8*0]	// a[0..3]
+	ldp	x8,x9,[x1,#8*2]
+	adds	x1,x1,#8*4		// clear carry bit
+	mov	x0,xzr
+	mov	x26,sp
+	b	Loop_mul4x_reduction
+
+.align	4
+Lmul4x_post:
+	// Final step. We see if the result is larger than the modulus,
+	// and if it is, subtract the modulus. But comparison implies
+	// subtraction. So we subtract the modulus, see if it borrowed,
+	// and conditionally copy the original value.
+	mov	x0,x12
+	mov	x27,x12		// x0 copy
+	subs	x10,x19,x14
+	add	x26,sp,#8*8
+	sbcs	x11,x20,x15
+	sub	x28,x5,#8*4
+
+Lmul4x_sub:
+	sbcs	x12,x21,x16
+	ldp	x14,x15,[x3,#8*0]
+	sub	x28,x28,#8*4
+	ldp	x19,x20,[x26,#8*0]
+	sbcs	x13,x22,x17
+	ldp	x16,x17,[x3,#8*2]
+	add	x3,x3,#8*4
+	ldp	x21,x22,[x26,#8*2]
+	add	x26,x26,#8*4
+	stp	x10,x11,[x0,#8*0]
+	sbcs	x10,x19,x14
+	stp	x12,x13,[x0,#8*2]
+	add	x0,x0,#8*4
+	sbcs	x11,x20,x15
+	cbnz	x28,Lmul4x_sub
+
+	sbcs	x12,x21,x16
+	mov	x26,sp
+	add	x1,sp,#8*4
+	ldp	x6,x7,[x27,#8*0]
+	sbcs	x13,x22,x17
+	stp	x10,x11,[x0,#8*0]
+	ldp	x8,x9,[x27,#8*2]
+	stp	x12,x13,[x0,#8*2]
+	ldp	x19,x20,[x1,#8*0]
+	ldp	x21,x22,[x1,#8*2]
+	sbcs	xzr,x30,xzr	// did it borrow?
+	ldr	x30,[x29,#8]		// pull return address
+
+	sub	x28,x5,#8*4
+Lmul4x_cond_copy:
+	sub	x28,x28,#8*4
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	ldp	x6,x7,[x27,#8*4]
+	ldp	x19,x20,[x1,#8*4]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*2]
+	add	x26,x26,#8*4
+	csel	x13,x22,x9,lo
+	ldp	x8,x9,[x27,#8*6]
+	ldp	x21,x22,[x1,#8*6]
+	add	x1,x1,#8*4
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+	add	x27,x27,#8*4
+	cbnz	x28,Lmul4x_cond_copy
+
+	csel	x10,x19,x6,lo
+	stp	xzr,xzr,[x26,#8*0]
+	csel	x11,x20,x7,lo
+	stp	xzr,xzr,[x26,#8*2]
+	csel	x12,x21,x8,lo
+	stp	xzr,xzr,[x26,#8*3]
+	csel	x13,x22,x9,lo
+	stp	xzr,xzr,[x26,#8*4]
+	stp	x10,x11,[x27,#8*0]
+	stp	x12,x13,[x27,#8*2]
+
+	b	Lmul4x_done
+
+.align	4
+Lmul4x4_post_condition:
+	adc	x0,x0,xzr
+	ldr	x1,[x29,#96]		// pull rp
+	// x19-x22,x0 hold the result, x14-x17 hold the modulus
+	subs	x6,x19,x14
+	ldr	x30,[x29,#8]		// pull return address
+	sbcs	x7,x20,x15
+	stp	xzr,xzr,[sp,#8*0]
+	sbcs	x8,x21,x16
+	stp	xzr,xzr,[sp,#8*2]
+	sbcs	x9,x22,x17
+	stp	xzr,xzr,[sp,#8*4]
+	sbcs	xzr,x0,xzr		// did it borrow?
+	stp	xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+	csel	x6,x19,x6,lo
+	csel	x7,x20,x7,lo
+	csel	x8,x21,x8,lo
+	csel	x9,x22,x9,lo
+	stp	x6,x7,[x1,#8*0]
+	stp	x8,x9,[x1,#8*2]
+
+Lmul4x_done:
+	ldp	x19,x20,[x29,#16]
+	mov	sp,x29
+	ldp	x21,x22,[x29,#32]
+	mov	x0,#1
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldr	x29,[sp],#128
+	// x30 is popped earlier
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	4
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
new file mode 100644
index 0000000..5441afc
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -0,0 +1,343 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl	_gcm_init_neon
+.private_extern	_gcm_init_neon
+
+.align	4
+_gcm_init_neon:
+	AARCH64_VALID_CALL_TARGET
+	// This function is adapted from gcm_init_v8. xC2 is t3.
+	ld1	{v17.2d}, [x1]			// load H
+	movi	v19.16b, #0xe1
+	shl	v19.2d, v19.2d, #57		// 0xc2.0
+	ext	v3.16b, v17.16b, v17.16b, #8
+	ushr	v18.2d, v19.2d, #63
+	dup	v17.4s, v17.s[1]
+	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
+	ushr	v18.2d, v3.2d, #63
+	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
+	and	v18.16b, v18.16b, v16.16b
+	shl	v3.2d, v3.2d, #1
+	ext	v18.16b, v18.16b, v18.16b, #8
+	and	v16.16b, v16.16b, v17.16b
+	orr	v3.16b, v3.16b, v18.16b	// H<<<=1
+	eor	v5.16b, v3.16b, v16.16b	// twisted H
+	st1	{v5.2d}, [x0]			// store Htable[0]
+	ret
+
+
+.globl	_gcm_gmult_neon
+.private_extern	_gcm_gmult_neon
+
+.align	4
+_gcm_gmult_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v3.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE		// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v3.16b, v3.16b		// byteswap Xi
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+	mov	x3, #16
+	b	Lgmult_neon
+
+
+.globl	_gcm_ghash_neon
+.private_extern	_gcm_ghash_neon
+
+.align	4
+_gcm_ghash_neon:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v0.16b}, [x0]		// load Xi
+	ld1	{v5.1d}, [x1], #8		// load twisted H
+	ld1	{v6.1d}, [x1]
+	adrp	x9, Lmasks@PAGE		// load constants
+	add	x9, x9, Lmasks@PAGEOFF
+	ld1	{v24.2d, v25.2d}, [x9]
+	rev64	v0.16b, v0.16b		// byteswap Xi
+	ext	v0.16b, v0.16b, v0.16b, #8
+	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing
+
+Loop_neon:
+	ld1	{v3.16b}, [x2], #16	// load inp
+	rev64	v3.16b, v3.16b		// byteswap inp
+	ext	v3.16b, v3.16b, v3.16b, #8
+	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi
+
+Lgmult_neon:
+	// Split the input into v3 and v4. (The upper halves are unused,
+	// so it is okay to leave them alone.)
+	ins	v4.d[0], v3.d[1]
+	ext	v16.8b, v5.8b, v5.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v0.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
+	ext	v17.8b, v5.8b, v5.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v5.8b, v5.8b, #3	// A3
+	eor	v16.16b, v16.16b, v0.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v0.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v0.16b	// N = I + J
+	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v0.16b, v0.16b, v16.16b
+	eor	v0.16b, v0.16b, v18.16b
+	eor	v3.8b, v3.8b, v4.8b	// Karatsuba pre-processing
+	ext	v16.8b, v7.8b, v7.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
+	ext	v1.8b, v3.8b, v3.8b, #1		// B1
+	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
+	ext	v17.8b, v7.8b, v7.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
+	ext	v19.8b, v3.8b, v3.8b, #2	// B2
+	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v7.8b, v7.8b, #3	// A3
+	eor	v16.16b, v16.16b, v1.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
+	ext	v1.8b, v3.8b, v3.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v3.8b, v3.8b, #4	// B4
+	eor	v18.16b, v18.16b, v1.16b	// N = I + J
+	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v1.16b, v1.16b, v16.16b
+	eor	v1.16b, v1.16b, v18.16b
+	ext	v16.8b, v6.8b, v6.8b, #1	// A1
+	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
+	ext	v2.8b, v4.8b, v4.8b, #1		// B1
+	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
+	ext	v17.8b, v6.8b, v6.8b, #2	// A2
+	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
+	ext	v19.8b, v4.8b, v4.8b, #2	// B2
+	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
+	ext	v18.8b, v6.8b, v6.8b, #3	// A3
+	eor	v16.16b, v16.16b, v2.16b	// L = E + F
+	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
+	ext	v2.8b, v4.8b, v4.8b, #3		// B3
+	eor	v17.16b, v17.16b, v19.16b	// M = G + H
+	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3
+
+	// Here we diverge from the 32-bit version. It computes the following
+	// (instructions reordered for clarity):
+	//
+	//     veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
+	//     vand	$t0#hi, $t0#hi, $k48
+	//     veor	$t0#lo, $t0#lo, $t0#hi
+	//
+	//     veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
+	//     vand	$t1#hi, $t1#hi, $k32
+	//     veor	$t1#lo, $t1#lo, $t1#hi
+	//
+	//     veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
+	//     vand	$t2#hi, $t2#hi, $k16
+	//     veor	$t2#lo, $t2#lo, $t2#hi
+	//
+	//     veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
+	//     vmov.i64	$t3#hi, #0
+	//
+	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+	// upper halves of SIMD registers, so we must split each half into
+	// separate registers. To compensate, we pair computations up and
+	// parallelize.
+
+	ext	v19.8b, v4.8b, v4.8b, #4	// B4
+	eor	v18.16b, v18.16b, v2.16b	// N = I + J
+	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4
+
+	// This can probably be scheduled more efficiently. For now, we just
+	// pair up independent instructions.
+	zip1	v20.2d, v16.2d, v17.2d
+	zip1	v22.2d, v18.2d, v19.2d
+	zip2	v21.2d, v16.2d, v17.2d
+	zip2	v23.2d, v18.2d, v19.2d
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	and	v21.16b, v21.16b, v24.16b
+	and	v23.16b, v23.16b, v25.16b
+	eor	v20.16b, v20.16b, v21.16b
+	eor	v22.16b, v22.16b, v23.16b
+	zip1	v16.2d, v20.2d, v21.2d
+	zip1	v18.2d, v22.2d, v23.2d
+	zip2	v17.2d, v20.2d, v21.2d
+	zip2	v19.2d, v22.2d, v23.2d
+
+	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
+	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
+	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
+	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
+	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
+	eor	v16.16b, v16.16b, v17.16b
+	eor	v18.16b, v18.16b, v19.16b
+	eor	v2.16b, v2.16b, v16.16b
+	eor	v2.16b, v2.16b, v18.16b
+	ext	v16.16b, v0.16b, v2.16b, #8
+	eor	v1.16b, v1.16b, v0.16b	// Karatsuba post-processing
+	eor	v1.16b, v1.16b, v2.16b
+	eor	v1.16b, v1.16b, v16.16b	// Xm overlaps Xh.lo and Xl.hi
+	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
+	// This is a no-op due to the ins instruction below.
+	// ins	v2.d[0], v1.d[1]
+
+	// equivalent of reduction_avx from ghash-x86_64.pl
+	shl	v17.2d, v0.2d, #57		// 1st phase
+	shl	v18.2d, v0.2d, #62
+	eor	v18.16b, v18.16b, v17.16b	//
+	shl	v17.2d, v0.2d, #63
+	eor	v18.16b, v18.16b, v17.16b	//
+	// Note Xm contains {Xl.d[1], Xh.d[0]}.
+	eor	v18.16b, v18.16b, v1.16b
+	ins	v0.d[1], v18.d[0]		// Xl.d[1] ^= t2.d[0]
+	ins	v2.d[0], v18.d[1]		// Xh.d[0] ^= t2.d[1]
+
+	ushr	v18.2d, v0.2d, #1		// 2nd phase
+	eor	v2.16b, v2.16b,v0.16b
+	eor	v0.16b, v0.16b,v18.16b	//
+	ushr	v18.2d, v18.2d, #6
+	ushr	v0.2d, v0.2d, #1		//
+	eor	v0.16b, v0.16b, v2.16b	//
+	eor	v0.16b, v0.16b, v18.16b	//
+
+	subs	x3, x3, #16
+	bne	Loop_neon
+
+	rev64	v0.16b, v0.16b		// byteswap Xi and write
+	ext	v0.16b, v0.16b, v0.16b, #8
+	st1	{v0.16b}, [x0]
+
+	ret
+
+
+.section	__TEXT,__const
+.align	4
+Lmasks:
+.quad	0x0000ffffffffffff	// k48
+.quad	0x00000000ffffffff	// k32
+.quad	0x000000000000ffff	// k16
+.quad	0x0000000000000000	// k0
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM
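
Editor's note: the file above implements GHASH using only the baseline NEON 8-bit polynomial multiplier, so each 64x64 carryless multiply is assembled from the shifted A1..A4/B1..B4 partial products and the k48/k32/k16 masks stored at Lmasks. The function all of that computes is multiplication in GF(2^128) with GCM's reduction polynomial (the 0xe1 constant). A slow bit-serial reference in the style of NIST SP 800-38D is sketched below for orientation only; u128 and gf128_mul are illustrative names, the sketch assumes the operands are already in the spec's bit order (the rev64/ext byteswaps above handle memory layout), and it is far too slow for production use.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;  /* hi holds bits 0..63 */

/* Bit-serial Z = X * H in GF(2^128), reducing by x^128 + x^7 + x^2 +
 * x + 1 in GCM's reflected convention (R = 0xe1 followed by zeros). */
static u128 gf128_mul(u128 X, u128 H) {
  u128 Z = {0, 0}, V = H;
  for (int i = 0; i < 128; i++) {
    /* Take X's bit i, numbering from the most significant end. */
    uint64_t bit = (i < 64 ? X.hi >> (63 - i) : X.lo >> (127 - i)) & 1;
    Z.hi ^= V.hi & (0 - bit);  /* Z ^= V when the bit is set */
    Z.lo ^= V.lo & (0 - bit);
    /* V = V >> 1, folding the dropped bit back in via R. */
    uint64_t lsb = V.lo & 1;
    V.lo = (V.lo >> 1) | (V.hi << 63);
    V.hi = (V.hi >> 1) ^ ((0 - lsb) & 0xe100000000000000ULL);
  }
  return Z;
}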
diff --git a/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
new file mode 100644
index 0000000..0ba0cdd
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -0,0 +1,573 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.globl	_gcm_init_v8
+.private_extern	_gcm_init_v8
+
+.align	4
+_gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x1]		//load input H
+	movi	v19.16b,#0xe1
+	shl	v19.2d,v19.2d,#57		//0xc2.0
+	ext	v3.16b,v17.16b,v17.16b,#8
+	ushr	v18.2d,v19.2d,#63
+	dup	v17.4s,v17.s[1]
+	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
+	ushr	v18.2d,v3.2d,#63
+	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
+	and	v18.16b,v18.16b,v16.16b
+	shl	v3.2d,v3.2d,#1
+	ext	v18.16b,v18.16b,v18.16b,#8
+	and	v16.16b,v16.16b,v17.16b
+	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
+	eor	v20.16b,v3.16b,v16.16b		//twisted H
+	st1	{v20.2d},[x0],#16		//store Htable[0]
+
+	//calculate H^2
+	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
+	pmull	v0.1q,v20.1d,v20.1d
+	eor	v16.16b,v16.16b,v20.16b
+	pmull2	v2.1q,v20.2d,v20.2d
+	pmull	v1.1q,v16.1d,v16.1d
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v22.16b,v0.16b,v18.16b
+
+	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
+	//calculate H^3 and H^4
+	pmull	v0.1q,v20.1d, v22.1d
+	pmull	v5.1q,v22.1d,v22.1d
+	pmull2	v2.1q,v20.2d, v22.2d
+	pmull2	v7.1q,v22.2d,v22.2d
+	pmull	v1.1q,v16.1d,v17.1d
+	pmull	v6.1q,v17.1d,v17.1d
+
+	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	ext	v17.16b,v5.16b,v7.16b,#8
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v16.16b
+	eor	v4.16b,v5.16b,v7.16b
+	eor	v6.16b,v6.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase
+	eor	v6.16b,v6.16b,v4.16b
+	pmull	v4.1q,v5.1d,v19.1d
+
+	ins	v2.d[0],v1.d[1]
+	ins	v7.d[0],v6.d[1]
+	ins	v1.d[1],v0.d[0]
+	ins	v6.d[1],v5.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+	eor	v5.16b,v6.16b,v4.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
+	ext	v4.16b,v5.16b,v5.16b,#8
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v5.1q,v5.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v4.16b,v4.16b,v7.16b
+	eor	v20.16b, v0.16b,v18.16b		//H^3
+	eor	v22.16b,v5.16b,v4.16b		//H^4
+
+	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
+	ext	v17.16b,v22.16b,v22.16b,#8
+	eor	v16.16b,v16.16b,v20.16b
+	eor	v17.16b,v17.16b,v22.16b
+	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
+	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
+	ret
+
+.globl	_gcm_gmult_v8
+.private_extern	_gcm_gmult_v8
+
+.align	4
+_gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	ld1	{v17.2d},[x0]		//load Xi
+	movi	v19.16b,#0xe1
+	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
+	shl	v19.2d,v19.2d,#57
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v3.16b,v17.16b,v17.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.globl	_gcm_ghash_v8
+.private_extern	_gcm_ghash_v8
+
+.align	4
+_gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	cmp	x3,#64
+	b.hs	Lgcm_ghash_v8_4x
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+						//"[rotated]" means the
+						//loaded value must be
+						//rotated to appear as it
+						//does in the algorithm
+						//specification
+	subs	x3,x3,#32		//see if x3 is 32 or larger
+	mov	x12,#16		//x12 is used as the post-
+						//increment for the input
+						//pointer; as the loop is
+						//modulo-scheduled, x12 is
+						//zeroed just in time to
+						//preclude overstepping
+						//inp[len]: the last block[s]
+						//are actually loaded twice,
+						//but the last copy is not
+						//processed
+	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v22.2d},[x1]
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
+	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
+	b.lo	Lodd_tail_v8		//x3 was less than 32
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ext	v7.16b,v17.16b,v17.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	pmull2	v6.1q,v20.2d,v7.2d
+	b	Loop_mod2x_v8
+
+.align	4
+Loop_mod2x_v8:
+	ext	v18.16b,v3.16b,v3.16b,#8
+	subs	x3,x3,#32		//is there more data?
+	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
+	csel	x12,xzr,x12,lo			//is it time to zero x12?
+
+	pmull	v5.1q,v21.1d,v17.1d
+	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
+	eor	v0.16b,v0.16b,v4.16b		//accumulate
+	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]
+
+	eor	v2.16b,v2.16b,v6.16b
+	csel	x12,xzr,x12,eq			//is it time to zero x12?
+	eor	v1.16b,v1.16b,v5.16b
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
+#ifndef __AARCH64EB__
+	rev64	v16.16b,v16.16b
+#endif
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+#ifndef __AARCH64EB__
+	rev64	v17.16b,v17.16b
+#endif
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v7.16b,v17.16b,v17.16b,#8
+	ext	v3.16b,v16.16b,v16.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
+	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v3.16b,v3.16b,v18.16b
+	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
+	eor	v3.16b,v3.16b,v0.16b
+	pmull2	v6.1q,v20.2d,v7.2d
+	b.hs	Loop_mod2x_v8		//there was at least 32 more bytes
+
+	eor	v2.16b,v2.16b,v18.16b
+	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
+	adds	x3,x3,#32		//re-construct x3
+	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
+	b.eq	Ldone_v8		//is x3 zero?
+Lodd_tail_v8:
+	ext	v18.16b,v0.16b,v0.16b,#8
+	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
+	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi
+
+	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
+	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
+	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
+	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+
+Ldone_v8:
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	ext	v0.16b,v0.16b,v0.16b,#8
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+
+.align	4
+gcm_ghash_v8_4x:
+Lgcm_ghash_v8_4x:
+	ld1	{v0.2d},[x0]		//load [rotated] Xi
+	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
+	movi	v19.16b,#0xe1
+	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
+	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
+
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+	ext	v25.16b,v7.16b,v7.16b,#8
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#128
+	b.lo	Ltail4x
+
+	b	Loop4x
+
+.align	4
+Loop4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+	ext	v3.16b,v16.16b,v16.16b,#8
+#ifndef __AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v7.16b,v7.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	ext	v25.16b,v7.16b,v7.16b,#8
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	ext	v24.16b,v6.16b,v6.16b,#8
+	eor	v1.16b,v1.16b,v30.16b
+	ext	v23.16b,v5.16b,v5.16b,#8
+
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
+	eor	v7.16b,v7.16b,v25.16b
+	eor	v1.16b,v1.16b,v17.16b
+	pmull2	v31.1q,v20.2d,v25.2d
+	eor	v1.16b,v1.16b,v18.16b
+	pmull	v30.1q,v21.1d,v7.1d
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+	pmull2	v24.1q,v22.2d,v24.2d
+	eor	v0.16b,v1.16b,v18.16b
+	pmull2	v6.1q,v21.2d,v6.2d
+
+	eor	v29.16b,v29.16b,v16.16b
+	eor	v31.16b,v31.16b,v24.16b
+	eor	v30.16b,v30.16b,v6.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v23.1q,v26.2d,v23.2d
+	pmull	v5.1q,v27.1d,v5.1d
+
+	eor	v0.16b,v0.16b,v18.16b
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+	eor	v30.16b,v30.16b,v5.16b
+
+	subs	x3,x3,#64
+	b.hs	Loop4x
+
+Ltail4x:
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v28.2d,v3.2d
+	pmull2	v1.1q,v27.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+
+	adds	x3,x3,#64
+	b.eq	Ldone4x
+
+	cmp	x3,#32
+	b.lo	Lone
+	b.eq	Ltwo
+Lthree:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d,v6.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v6.16b,v6.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v24.16b,v6.16b,v6.16b,#8
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
+	eor	v6.16b,v6.16b,v24.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	pmull2	v31.1q,v20.2d,v24.2d
+	pmull	v30.1q,v21.1d,v6.1d
+	eor	v0.16b,v0.16b,v18.16b
+	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull2	v23.1q,v22.2d,v23.2d
+	eor	v16.16b,v4.16b,v0.16b
+	pmull2	v5.1q,v21.2d,v5.2d
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	eor	v29.16b,v29.16b,v7.16b
+	eor	v31.16b,v31.16b,v23.16b
+	eor	v30.16b,v30.16b,v5.16b
+
+	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v26.2d,v3.2d
+	pmull	v1.1q,v27.1d,v16.1d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Ltwo:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d,v5.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v5.16b,v5.16b
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	ext	v23.16b,v5.16b,v5.16b,#8
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
+	eor	v5.16b,v5.16b,v23.16b
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull2	v31.1q,v20.2d,v23.2d
+	pmull	v30.1q,v21.1d,v5.1d
+
+	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v22.2d,v3.2d
+	pmull2	v1.1q,v21.2d,v16.2d
+
+	eor	v0.16b,v0.16b,v29.16b
+	eor	v2.16b,v2.16b,v31.16b
+	eor	v1.16b,v1.16b,v30.16b
+	b	Ldone4x
+
+.align	4
+Lone:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	ld1	{v4.2d},[x2]
+	eor	v1.16b,v1.16b,v18.16b
+#ifndef	__AARCH64EB__
+	rev64	v4.16b,v4.16b
+#endif
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+	eor	v16.16b,v4.16b,v0.16b
+	ext	v3.16b,v16.16b,v16.16b,#8
+
+	pmull	v0.1q,v20.1d,v3.1d
+	eor	v16.16b,v16.16b,v3.16b
+	pmull2	v2.1q,v20.2d,v3.2d
+	pmull	v1.1q,v21.1d,v16.1d
+
+Ldone4x:
+	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
+	eor	v18.16b,v0.16b,v2.16b
+	eor	v1.16b,v1.16b,v17.16b
+	eor	v1.16b,v1.16b,v18.16b
+
+	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
+	ins	v2.d[0],v1.d[1]
+	ins	v1.d[1],v0.d[0]
+	eor	v0.16b,v1.16b,v18.16b
+
+	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
+	pmull	v0.1q,v0.1d,v19.1d
+	eor	v18.16b,v18.16b,v2.16b
+	eor	v0.16b,v0.16b,v18.16b
+	ext	v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __AARCH64EB__
+	rev64	v0.16b,v0.16b
+#endif
+	st1	{v0.2d},[x0]		//write out Xi
+
+	ret
+
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
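
Editor's note: the PMULL-based file above gets its throughput from two ideas visible in the code. First, gcm_init_v8 precomputes (twisted forms of) H^2 through H^4 into Htable; second, Lgcm_ghash_v8_4x folds four ciphertext blocks per iteration so the multiplications are independent rather than serially dependent. The algebra behind the 4-way fold, reusing the illustrative u128/gf128_mul helpers from the previous sketch, looks like this:

/* One 4-block GHASH step, the identity Loop4x exploits:
 *   Xi' = (Xi ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H
 * which equals four sequential Xi = (Xi ^ Ck)*H updates, because XOR
 * distributes over multiplication in GF(2^128). */
static u128 xor128(u128 a, u128 b) {
  u128 r = {a.hi ^ b.hi, a.lo ^ b.lo};
  return r;
}

static u128 ghash4(u128 Xi, const u128 C[4],
                   u128 H, u128 H2, u128 H3, u128 H4) {
  u128 acc = gf128_mul(xor128(Xi, C[0]), H4);  /* (Xi ^ C0) * H^4 */
  acc = xor128(acc, gf128_mul(C[1], H3));      /* ^ C1 * H^3      */
  acc = xor128(acc, gf128_mul(C[2], H2));      /* ^ C2 * H^2      */
  return xor128(acc, gf128_mul(C[3], H));      /* ^ C3 * H        */
}

A smaller trick worth noting in gcm_ghash_v8 itself: the input post-increment register x12 is conditionally zeroed (the csel x12,xzr,x12 instructions) on the final iterations, so the modulo-scheduled loads harmlessly re-read the last block instead of running past inp[len].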
diff --git a/apple-aarch64/crypto/fipsmodule/sha1-armv8.S b/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
new file mode 100644
index 0000000..62ba800
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -0,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+
+.private_extern	_OPENSSL_armcap_P
+.globl	_sha1_block_data_order
+.private_extern	_sha1_block_data_order
+
+.align	6
+_sha1_block_data_order:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is never popped later.
+	AARCH64_VALID_CALL_TARGET
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+	adrp	x16,_OPENSSL_armcap_P@PAGE
+#endif
+	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+	tst	w16,#ARMV8_SHA1
+	b.ne	Lv8_entry
+
+	stp	x29,x30,[sp,#-96]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+
+	ldp	w20,w21,[x0]
+	ldp	w22,w23,[x0,#8]
+	ldr	w24,[x0,#16]
+
+Loop:
+	ldr	x3,[x1],#64
+	movz	w28,#0x7999
+	sub	x2,x2,#1
+	movk	w28,#0x5a82,lsl#16
+#ifdef	__AARCH64EB__
+	ror	x3,x3,#32
+#else
+	rev32	x3,x3
+#endif
+	add	w24,w24,w28		// warm it up
+	add	w24,w24,w3
+	lsr	x4,x3,#32
+	ldr	x5,[x1,#-56]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w4	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x5,x5,#32
+#else
+	rev32	x5,x5
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w5	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x6,x5,#32
+	ldr	x7,[x1,#-48]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w6	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x7,x7,#32
+#else
+	rev32	x7,x7
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w7	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x8,x7,#32
+	ldr	x9,[x1,#-40]
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w8	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x9,x9,#32
+#else
+	rev32	x9,x9
+#endif
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w9	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	lsr	x10,x9,#32
+	ldr	x11,[x1,#-32]
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w10	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x11,x11,#32
+#else
+	rev32	x11,x11
+#endif
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w11	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	lsr	x12,x11,#32
+	ldr	x13,[x1,#-24]
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w12	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x13,x13,#32
+#else
+	rev32	x13,x13
+#endif
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	add	w24,w24,w13	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	lsr	x14,x13,#32
+	ldr	x15,[x1,#-16]
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	add	w23,w23,w14	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x15,x15,#32
+#else
+	rev32	x15,x15
+#endif
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	add	w22,w22,w15	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	lsr	x16,x15,#32
+	ldr	x17,[x1,#-8]
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	add	w21,w21,w16	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+#ifdef	__AARCH64EB__
+	ror	x17,x17,#32
+#else
+	rev32	x17,x17
+#endif
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w17	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	lsr	x19,x17,#32
+	eor	w3,w3,w5
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w3,w3,w11
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w3,w3,w16
+	ror	w22,w22,#2
+	add	w24,w24,w19	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	bic	w25,w23,w21
+	and	w26,w22,w21
+	ror	w27,w20,#27
+	eor	w4,w4,w12
+	add	w23,w23,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w24,w24,w27		// e+=rot(a,5)
+	eor	w4,w4,w17
+	ror	w21,w21,#2
+	add	w23,w23,w3	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	bic	w25,w22,w20
+	and	w26,w21,w20
+	ror	w27,w24,#27
+	eor	w5,w5,w13
+	add	w22,w22,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w23,w23,w27		// e+=rot(a,5)
+	eor	w5,w5,w19
+	ror	w20,w20,#2
+	add	w22,w22,w4	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	bic	w25,w21,w24
+	and	w26,w20,w24
+	ror	w27,w23,#27
+	eor	w6,w6,w14
+	add	w21,w21,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w22,w22,w27		// e+=rot(a,5)
+	eor	w6,w6,w3
+	ror	w24,w24,#2
+	add	w21,w21,w5	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	bic	w25,w20,w23
+	and	w26,w24,w23
+	ror	w27,w22,#27
+	eor	w7,w7,w15
+	add	w20,w20,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w21,w21,w27		// e+=rot(a,5)
+	eor	w7,w7,w4
+	ror	w23,w23,#2
+	add	w20,w20,w6	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	movz	w28,#0xeba1
+	movk	w28,#0x6ed9,lsl#16
+	eor	w8,w8,w10
+	bic	w25,w24,w22
+	and	w26,w23,w22
+	ror	w27,w21,#27
+	eor	w8,w8,w16
+	add	w24,w24,w28		// future e+=K
+	orr	w25,w25,w26
+	add	w20,w20,w27		// e+=rot(a,5)
+	eor	w8,w8,w5
+	ror	w22,w22,#2
+	add	w24,w24,w7	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w9,w9,w6
+	add	w23,w23,w8	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w10,w10,w7
+	add	w22,w22,w9	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w11,w11,w8
+	add	w21,w21,w10	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w12,w12,w9
+	add	w20,w20,w11	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w13,w13,w10
+	add	w24,w24,w12	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w14,w14,w11
+	add	w23,w23,w13	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w15,w15,w12
+	add	w22,w22,w14	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w16,w16,w13
+	add	w21,w21,w15	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w17,w17,w14
+	add	w20,w20,w16	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w19,w19,w15
+	add	w24,w24,w17	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w3,w3,w16
+	add	w23,w23,w19	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w4,w4,w17
+	add	w22,w22,w3	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w5,w5,w19
+	add	w21,w21,w4	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w6,w6,w3
+	add	w20,w20,w5	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w7,w7,w4
+	add	w24,w24,w6	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w8,w8,w5
+	add	w23,w23,w7	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w9,w9,w6
+	add	w22,w22,w8	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w10,w10,w7
+	add	w21,w21,w9	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w11,w11,w8
+	add	w20,w20,w10	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	movz	w28,#0xbcdc
+	movk	w28,#0x8f1b,lsl#16
+	eor	w12,w12,w14
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w12,w12,w9
+	add	w24,w24,w11	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w13,w13,w15
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w13,w13,w10
+	add	w23,w23,w12	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w14,w14,w16
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w14,w14,w11
+	add	w22,w22,w13	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w15,w15,w17
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w15,w15,w12
+	add	w21,w21,w14	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w16,w16,w19
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w16,w16,w13
+	add	w20,w20,w15	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w17,w17,w3
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w17,w17,w9
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w17,w17,w14
+	add	w24,w24,w16	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w19,w19,w4
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w19,w19,w10
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w19,w19,w15
+	add	w23,w23,w17	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w3,w3,w5
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w3,w3,w11
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w3,w3,w16
+	add	w22,w22,w19	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w4,w4,w6
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w4,w4,w12
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w4,w4,w17
+	add	w21,w21,w3	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w5,w5,w7
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w5,w5,w13
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w5,w5,w19
+	add	w20,w20,w4	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w6,w6,w8
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w6,w6,w14
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w6,w6,w3
+	add	w24,w24,w5	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w7,w7,w9
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w7,w7,w15
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w7,w7,w4
+	add	w23,w23,w6	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w8,w8,w10
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w8,w8,w16
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w8,w8,w5
+	add	w22,w22,w7	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w9,w9,w11
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w9,w9,w17
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w9,w9,w6
+	add	w21,w21,w8	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w10,w10,w12
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w10,w10,w19
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w10,w10,w7
+	add	w20,w20,w9	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w11,w11,w13
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w11,w11,w3
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w11,w11,w8
+	add	w24,w24,w10	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	orr	w25,w21,w22
+	and	w26,w21,w22
+	eor	w12,w12,w14
+	ror	w27,w20,#27
+	and	w25,w25,w23
+	add	w23,w23,w28		// future e+=K
+	eor	w12,w12,w4
+	add	w24,w24,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w21,w21,#2
+	eor	w12,w12,w9
+	add	w23,w23,w11	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	orr	w25,w20,w21
+	and	w26,w20,w21
+	eor	w13,w13,w15
+	ror	w27,w24,#27
+	and	w25,w25,w22
+	add	w22,w22,w28		// future e+=K
+	eor	w13,w13,w5
+	add	w23,w23,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w20,w20,#2
+	eor	w13,w13,w10
+	add	w22,w22,w12	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	orr	w25,w24,w20
+	and	w26,w24,w20
+	eor	w14,w14,w16
+	ror	w27,w23,#27
+	and	w25,w25,w21
+	add	w21,w21,w28		// future e+=K
+	eor	w14,w14,w6
+	add	w22,w22,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w24,w24,#2
+	eor	w14,w14,w11
+	add	w21,w21,w13	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	orr	w25,w23,w24
+	and	w26,w23,w24
+	eor	w15,w15,w17
+	ror	w27,w22,#27
+	and	w25,w25,w20
+	add	w20,w20,w28		// future e+=K
+	eor	w15,w15,w7
+	add	w21,w21,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w23,w23,#2
+	eor	w15,w15,w12
+	add	w20,w20,w14	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	movz	w28,#0xc1d6
+	movk	w28,#0xca62,lsl#16
+	orr	w25,w22,w23
+	and	w26,w22,w23
+	eor	w16,w16,w19
+	ror	w27,w21,#27
+	and	w25,w25,w24
+	add	w24,w24,w28		// future e+=K
+	eor	w16,w16,w8
+	add	w20,w20,w27		// e+=rot(a,5)
+	orr	w25,w25,w26
+	ror	w22,w22,#2
+	eor	w16,w16,w13
+	add	w24,w24,w15	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w17,w17,w14
+	add	w23,w23,w16	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w19,w19,w15
+	add	w22,w22,w17	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	eor	w3,w3,w5
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w3,w3,w11
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w3,w3,w16
+	add	w21,w21,w19	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w3,w3,#31
+	eor	w4,w4,w6
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w4,w4,w12
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w4,w4,w17
+	add	w20,w20,w3	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w4,w4,#31
+	eor	w5,w5,w7
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w5,w5,w13
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w5,w5,w19
+	add	w24,w24,w4	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w5,w5,#31
+	eor	w6,w6,w8
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w6,w6,w14
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w6,w6,w3
+	add	w23,w23,w5	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w6,w6,#31
+	eor	w7,w7,w9
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w7,w7,w15
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w7,w7,w4
+	add	w22,w22,w6	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w7,w7,#31
+	eor	w8,w8,w10
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w8,w8,w16
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w8,w8,w5
+	add	w21,w21,w7	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w8,w8,#31
+	eor	w9,w9,w11
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w9,w9,w17
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w9,w9,w6
+	add	w20,w20,w8	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w9,w9,#31
+	eor	w10,w10,w12
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w10,w10,w19
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w10,w10,w7
+	add	w24,w24,w9	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w10,w10,#31
+	eor	w11,w11,w13
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w11,w11,w3
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w11,w11,w8
+	add	w23,w23,w10	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w11,w11,#31
+	eor	w12,w12,w14
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w12,w12,w4
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w12,w12,w9
+	add	w22,w22,w11	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w12,w12,#31
+	eor	w13,w13,w15
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w13,w13,w5
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w13,w13,w10
+	add	w21,w21,w12	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w13,w13,#31
+	eor	w14,w14,w16
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w14,w14,w6
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	eor	w14,w14,w11
+	add	w20,w20,w13	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ror	w14,w14,#31
+	eor	w15,w15,w17
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	add	w24,w24,w28		// future e+=K
+	eor	w15,w15,w7
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	eor	w15,w15,w12
+	add	w24,w24,w14	// future e+=X[i]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	ror	w15,w15,#31
+	eor	w16,w16,w19
+	eor	w25,w23,w21
+	ror	w27,w20,#27
+	add	w23,w23,w28		// future e+=K
+	eor	w16,w16,w8
+	eor	w25,w25,w22
+	add	w24,w24,w27		// e+=rot(a,5)
+	ror	w21,w21,#2
+	eor	w16,w16,w13
+	add	w23,w23,w15	// future e+=X[i]
+	add	w24,w24,w25		// e+=F(b,c,d)
+	ror	w16,w16,#31
+	eor	w17,w17,w3
+	eor	w25,w22,w20
+	ror	w27,w24,#27
+	add	w22,w22,w28		// future e+=K
+	eor	w17,w17,w9
+	eor	w25,w25,w21
+	add	w23,w23,w27		// e+=rot(a,5)
+	ror	w20,w20,#2
+	eor	w17,w17,w14
+	add	w22,w22,w16	// future e+=X[i]
+	add	w23,w23,w25		// e+=F(b,c,d)
+	ror	w17,w17,#31
+	eor	w19,w19,w4
+	eor	w25,w21,w24
+	ror	w27,w23,#27
+	add	w21,w21,w28		// future e+=K
+	eor	w19,w19,w10
+	eor	w25,w25,w20
+	add	w22,w22,w27		// e+=rot(a,5)
+	ror	w24,w24,#2
+	eor	w19,w19,w15
+	add	w21,w21,w17	// future e+=X[i]
+	add	w22,w22,w25		// e+=F(b,c,d)
+	ror	w19,w19,#31
+	ldp	w4,w5,[x0]
+	eor	w25,w20,w23
+	ror	w27,w22,#27
+	add	w20,w20,w28		// future e+=K
+	eor	w25,w25,w24
+	add	w21,w21,w27		// e+=rot(a,5)
+	ror	w23,w23,#2
+	add	w20,w20,w19	// future e+=X[i]
+	add	w21,w21,w25		// e+=F(b,c,d)
+	ldp	w6,w7,[x0,#8]
+	eor	w25,w24,w22
+	ror	w27,w21,#27
+	eor	w25,w25,w23
+	add	w20,w20,w27		// e+=rot(a,5)
+	ror	w22,w22,#2
+	ldr	w8,[x0,#16]
+	add	w20,w20,w25		// e+=F(b,c,d)
+	add	w21,w21,w5
+	add	w22,w22,w6
+	add	w20,w20,w4
+	add	w23,w23,w7
+	add	w24,w24,w8
+	stp	w20,w21,[x0]
+	stp	w22,w23,[x0,#8]
+	str	w24,[x0,#16]
+	cbnz	x2,Loop
+
+	ldp	x19,x20,[sp,#16]
+	ldp	x21,x22,[sp,#32]
+	ldp	x23,x24,[sp,#48]
+	ldp	x25,x26,[sp,#64]
+	ldp	x27,x28,[sp,#80]
+	ldr	x29,[sp],#96
+	ret
+
+
+.align	6
+sha1_block_armv8:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is never popped later.
+	AARCH64_VALID_CALL_TARGET
+Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	adrp	x4,Lconst@PAGE
+	add	x4,x4,Lconst@PAGEOFF
+	eor	v1.16b,v1.16b,v1.16b
+	ld1	{v0.4s},[x0],#16
+	ld1	{v1.s}[0],[x0]
+	sub	x0,x0,#16
+	ld1	{v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+
+	add	v20.4s,v16.4s,v4.4s
+	rev32	v6.16b,v6.16b
+	orr	v22.16b,v0.16b,v0.16b	// offload
+
+	add	v21.4s,v16.4s,v5.4s
+	rev32	v7.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b
+.long	0x5e140020	//sha1c v0.16b,v1.16b,v20.4s		// 0
+	add	v20.4s,v16.4s,v6.4s
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 1
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v16.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 2
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v16.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 3
+.long	0x5e150060	//sha1c v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 4
+.long	0x5e140040	//sha1c v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 5
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 6
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v17.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 7
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v17.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 8
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 9
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 10
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 11
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v18.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 12
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v18.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e0630a4	//sha1su0 v4.16b,v5.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 13
+.long	0x5e152060	//sha1m v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+.long	0x5e2818e4	//sha1su1 v4.16b,v7.16b
+.long	0x5e0730c5	//sha1su0 v5.16b,v6.16b,v7.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 14
+.long	0x5e142040	//sha1m v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v4.4s
+.long	0x5e281885	//sha1su1 v5.16b,v4.16b
+.long	0x5e0430e6	//sha1su0 v6.16b,v7.16b,v4.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 15
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v5.4s
+.long	0x5e2818a6	//sha1su1 v6.16b,v5.16b
+.long	0x5e053087	//sha1su0 v7.16b,v4.16b,v5.16b
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 16
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+	add	v20.4s,v19.4s,v6.4s
+.long	0x5e2818c7	//sha1su1 v7.16b,v6.16b
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 17
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+	add	v21.4s,v19.4s,v7.4s
+
+.long	0x5e280803	//sha1h v3.16b,v0.16b		// 18
+.long	0x5e141040	//sha1p v0.16b,v2.16b,v20.4s
+
+.long	0x5e280802	//sha1h v2.16b,v0.16b		// 19
+.long	0x5e151060	//sha1p v0.16b,v3.16b,v21.4s
+
+	add	v1.4s,v1.4s,v2.4s
+	add	v0.4s,v0.4s,v22.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s},[x0],#16
+	st1	{v1.s}[0],[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+.section	__TEXT,__const
+.align	6
+Lconst:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	//K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	//K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	//K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	//K_60_79
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM
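
Editor's note: the scalar path in the file above unrolls all eighty SHA-1 rounds, but every block of twenty shares one boolean function and one constant. The movz/movk pairs build 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6 in turn, and the instruction patterns cycle through Ch (bic/and/orr), Parity (eor chains) and Maj (orr/and pairs). A compact FIPS 180-4 statement of that per-round logic is sketched below for orientation; it is generic reference logic, not a rewrite of the generated code.

#include <stdint.h>

static const uint32_t kSHA1RoundConstants[4] = {
    0x5a827999,  /* rounds  0-19, first movz/movk pair above */
    0x6ed9eba1,  /* rounds 20-39 */
    0x8f1bbcdc,  /* rounds 40-59 */
    0xca62c1d6,  /* rounds 60-79 */
};

/* f_t from FIPS 180-4: Ch, Parity, Maj, Parity across the four
 * twenty-round groups, matching the instruction patterns above. */
static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d) {
  if (t < 20) return (b & c) | (~b & d);           /* Ch: bic/and/orr */
  if (t < 40) return b ^ c ^ d;                    /* Parity: eor     */
  if (t < 60) return (b & c) | (b & d) | (c & d);  /* Maj: orr/and    */
  return b ^ c ^ d;                                /* Parity: eor     */
}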
diff --git a/apple-aarch64/crypto/fipsmodule/sha256-armv8.S b/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
new file mode 100644
index 0000000..b40b260
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -0,0 +1,1212 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are an
+//	indication of some compiler "pathology"; most notably, code
+//	generated with -mgeneral-regs-only is significantly faster,
+//	and there the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
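+
+// Reference for the scalar code below (FIPS 180-4, matching the per-line
+// comments): with 32-bit words, each of the 64 rounds computes
+//
+//	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
+//	T2 = Sigma0(a) + Maj(a,b,c)
+//	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2;
+//
+// where Ch(e,f,g) = (e&f)^(~e&g), Maj(a,b,c) = (a&b)^(a&c)^(b&c),
+// Sigma0(a) = ror(a,2)^ror(a,13)^ror(a,22) and
+// Sigma1(e) = ror(e,6)^ror(e,11)^ror(e,25). Rather than shuffling values
+// between fixed registers, the code rotates the register assignment of
+// a..h each round, which is why successive rounds reference a rotating
+// set of w-registers.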
+
+.text
+
+
+.private_extern	_OPENSSL_armcap_P
+.globl	_sha256_block_data_order
+.private_extern	_sha256_block_data_order
+
+.align	6
+_sha256_block_data_order:
+	AARCH64_VALID_CALL_TARGET
+#ifndef	__KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+	adrp	x16,_OPENSSL_armcap_P@PAGE
+#endif
+	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+	tst	w16,#ARMV8_SHA256
+	b.ne	Lv8_entry
+#endif
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*4
+
+	ldp	w20,w21,[x0]				// load context
+	ldp	w22,w23,[x0,#2*4]
+	ldp	w24,w25,[x0,#4*4]
+	add	x2,x1,x2,lsl#6	// end of input
+	ldp	w26,w27,[x0,#6*4]
+	adrp	x30,LK256@PAGE
+	add	x30,x30,LK256@PAGEOFF
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	w3,w4,[x1],#2*4
+	ldr	w19,[x30],#4			// *K++
+	eor	w28,w21,w22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	w3,w3			// 0
+#endif
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w6,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w3			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w4,w4			// 1
+#endif
+	ldp	w5,w6,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w7,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w4			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w5,w5			// 2
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w8,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w5			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w6,w6			// 3
+#endif
+	ldp	w7,w8,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w9,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w6			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w7,w7			// 4
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w10,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w7			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w10,ror#11	// Sigma1(e)
+	ror	w10,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w10,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w8,w8			// 5
+#endif
+	ldp	w9,w10,[x1],#2*4
+	add	w23,w23,w17			// h+=Sigma0(a)
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w11,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w8			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w11,ror#11	// Sigma1(e)
+	ror	w11,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w11,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w9,w9			// 6
+#endif
+	add	w22,w22,w17			// h+=Sigma0(a)
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w12,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w9			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w12,ror#11	// Sigma1(e)
+	ror	w12,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w12,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w10,w10			// 7
+#endif
+	ldp	w11,w12,[x1],#2*4
+	add	w21,w21,w17			// h+=Sigma0(a)
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	eor	w13,w25,w25,ror#14
+	and	w17,w26,w25
+	bic	w28,w27,w25
+	add	w20,w20,w10			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w13,ror#11	// Sigma1(e)
+	ror	w13,w21,#2
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	eor	w17,w21,w21,ror#9
+	add	w20,w20,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w24,w24,w20			// d+=h
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w13,w17,ror#13	// Sigma0(a)
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w20,w20,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w11,w11			// 8
+#endif
+	add	w20,w20,w17			// h+=Sigma0(a)
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	eor	w14,w24,w24,ror#14
+	and	w17,w25,w24
+	bic	w19,w26,w24
+	add	w27,w27,w11			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w14,ror#11	// Sigma1(e)
+	ror	w14,w20,#2
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	eor	w17,w20,w20,ror#9
+	add	w27,w27,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w23,w23,w27			// d+=h
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w14,w17,ror#13	// Sigma0(a)
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w27,w27,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w12,w12			// 9
+#endif
+	ldp	w13,w14,[x1],#2*4
+	add	w27,w27,w17			// h+=Sigma0(a)
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	eor	w15,w23,w23,ror#14
+	and	w17,w24,w23
+	bic	w28,w25,w23
+	add	w26,w26,w12			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w15,ror#11	// Sigma1(e)
+	ror	w15,w27,#2
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	eor	w17,w27,w27,ror#9
+	add	w26,w26,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w22,w22,w26			// d+=h
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w15,w17,ror#13	// Sigma0(a)
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w26,w26,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w13,w13			// 10
+#endif
+	add	w26,w26,w17			// h+=Sigma0(a)
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	eor	w0,w22,w22,ror#14
+	and	w17,w23,w22
+	bic	w19,w24,w22
+	add	w25,w25,w13			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w0,ror#11	// Sigma1(e)
+	ror	w0,w26,#2
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	eor	w17,w26,w26,ror#9
+	add	w25,w25,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w21,w21,w25			// d+=h
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w0,w17,ror#13	// Sigma0(a)
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w25,w25,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w14,w14			// 11
+#endif
+	ldp	w15,w0,[x1],#2*4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	eor	w6,w21,w21,ror#14
+	and	w17,w22,w21
+	bic	w28,w23,w21
+	add	w24,w24,w14			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w6,ror#11	// Sigma1(e)
+	ror	w6,w25,#2
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	eor	w17,w25,w25,ror#9
+	add	w24,w24,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w20,w20,w24			// d+=h
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w17,ror#13	// Sigma0(a)
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w24,w24,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w15,w15			// 12
+#endif
+	add	w24,w24,w17			// h+=Sigma0(a)
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	eor	w7,w20,w20,ror#14
+	and	w17,w21,w20
+	bic	w19,w22,w20
+	add	w23,w23,w15			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w7,ror#11	// Sigma1(e)
+	ror	w7,w24,#2
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	eor	w17,w24,w24,ror#9
+	add	w23,w23,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w27,w27,w23			// d+=h
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w17,ror#13	// Sigma0(a)
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w23,w23,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w0,w0			// 13
+#endif
+	ldp	w1,w2,[x1]
+	add	w23,w23,w17			// h+=Sigma0(a)
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	eor	w8,w27,w27,ror#14
+	and	w17,w20,w27
+	bic	w28,w21,w27
+	add	w22,w22,w0			// h+=X[i]
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w8,ror#11	// Sigma1(e)
+	ror	w8,w23,#2
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	eor	w17,w23,w23,ror#9
+	add	w22,w22,w16			// h+=Sigma1(e)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	add	w26,w26,w22			// d+=h
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w17,ror#13	// Sigma0(a)
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	//add	w22,w22,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w1,w1			// 14
+#endif
+	ldr	w6,[sp,#12]
+	add	w22,w22,w17			// h+=Sigma0(a)
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	eor	w9,w26,w26,ror#14
+	and	w17,w27,w26
+	bic	w19,w20,w26
+	add	w21,w21,w1			// h+=X[i]
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w9,ror#11	// Sigma1(e)
+	ror	w9,w22,#2
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	eor	w17,w22,w22,ror#9
+	add	w21,w21,w16			// h+=Sigma1(e)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	add	w25,w25,w21			// d+=h
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w17,ror#13	// Sigma0(a)
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	//add	w21,w21,w17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	w2,w2			// 15
+#endif
+	ldr	w7,[sp,#0]
+	add	w21,w21,w17			// h+=Sigma0(a)
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
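+	// From round 16 on, the message schedule is extended in place:
+	//	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
+	// with sigma0(x) = ror(x,7)^ror(x,18)^(x>>3) and
+	// sigma1(x) = ror(x,17)^ror(x,19)^(x>>10), interleaved with the
+	// round computation; older W values are spilled to the 16-byte
+	// scratch area reserved by "sub sp,sp,#4*4" above.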
+Loop_16_xx:
+	ldr	w8,[sp,#4]
+	str	w11,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w10,w5,#7
+	and	w17,w25,w24
+	ror	w9,w2,#17
+	bic	w19,w26,w24
+	ror	w11,w20,#2
+	add	w27,w27,w3			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w10,w10,w5,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w11,w11,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w9,w9,w2,ror#19
+	eor	w10,w10,w5,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w11,w20,ror#22	// Sigma0(a)
+	eor	w9,w9,w2,lsr#10	// sigma1(X[i+14])
+	add	w4,w4,w13
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w4,w4,w10
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w4,w4,w9
+	ldr	w9,[sp,#8]
+	str	w12,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w11,w6,#7
+	and	w17,w24,w23
+	ror	w10,w3,#17
+	bic	w28,w25,w23
+	ror	w12,w27,#2
+	add	w26,w26,w4			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w11,w11,w6,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w12,w12,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w10,w10,w3,ror#19
+	eor	w11,w11,w6,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w12,w27,ror#22	// Sigma0(a)
+	eor	w10,w10,w3,lsr#10	// sigma1(X[i+14])
+	add	w5,w5,w14
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w5,w5,w11
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w5,w5,w10
+	ldr	w10,[sp,#12]
+	str	w13,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w12,w7,#7
+	and	w17,w23,w22
+	ror	w11,w4,#17
+	bic	w19,w24,w22
+	ror	w13,w26,#2
+	add	w25,w25,w5			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w12,w12,w7,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w13,w13,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w11,w11,w4,ror#19
+	eor	w12,w12,w7,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w13,w26,ror#22	// Sigma0(a)
+	eor	w11,w11,w4,lsr#10	// sigma1(X[i+14])
+	add	w6,w6,w15
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w6,w6,w12
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w6,w6,w11
+	ldr	w11,[sp,#0]
+	str	w14,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w13,w8,#7
+	and	w17,w22,w21
+	ror	w12,w5,#17
+	bic	w28,w23,w21
+	ror	w14,w25,#2
+	add	w24,w24,w6			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w13,w13,w8,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w14,w14,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w12,w12,w5,ror#19
+	eor	w13,w13,w8,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w14,w25,ror#22	// Sigma0(a)
+	eor	w12,w12,w5,lsr#10	// sigma1(X[i+14])
+	add	w7,w7,w0
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w7,w7,w13
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w7,w7,w12
+	ldr	w12,[sp,#4]
+	str	w15,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w14,w9,#7
+	and	w17,w21,w20
+	ror	w13,w6,#17
+	bic	w19,w22,w20
+	ror	w15,w24,#2
+	add	w23,w23,w7			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w14,w14,w9,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w15,w15,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w13,w13,w6,ror#19
+	eor	w14,w14,w9,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w15,w24,ror#22	// Sigma0(a)
+	eor	w13,w13,w6,lsr#10	// sigma1(X[i+14])
+	add	w8,w8,w1
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w8,w8,w14
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w8,w8,w13
+	ldr	w13,[sp,#8]
+	str	w0,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w15,w10,#7
+	and	w17,w20,w27
+	ror	w14,w7,#17
+	bic	w28,w21,w27
+	ror	w0,w23,#2
+	add	w22,w22,w8			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w15,w15,w10,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w0,w0,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w14,w14,w7,ror#19
+	eor	w15,w15,w10,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w0,w23,ror#22	// Sigma0(a)
+	eor	w14,w14,w7,lsr#10	// sigma1(X[i+14])
+	add	w9,w9,w2
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w9,w9,w15
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w9,w9,w14
+	ldr	w14,[sp,#12]
+	str	w1,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w0,w11,#7
+	and	w17,w27,w26
+	ror	w15,w8,#17
+	bic	w19,w20,w26
+	ror	w1,w22,#2
+	add	w21,w21,w9			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w0,w0,w11,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w1,w1,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w15,w15,w8,ror#19
+	eor	w0,w0,w11,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w1,w22,ror#22	// Sigma0(a)
+	eor	w15,w15,w8,lsr#10	// sigma1(X[i+14])
+	add	w10,w10,w3
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w10,w10,w0
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w10,w10,w15
+	ldr	w15,[sp,#0]
+	str	w2,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w1,w12,#7
+	and	w17,w26,w25
+	ror	w0,w9,#17
+	bic	w28,w27,w25
+	ror	w2,w21,#2
+	add	w20,w20,w10			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w1,w1,w12,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w2,w2,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w0,w0,w9,ror#19
+	eor	w1,w1,w12,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w2,w21,ror#22	// Sigma0(a)
+	eor	w0,w0,w9,lsr#10	// sigma1(X[i+14])
+	add	w11,w11,w4
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w11,w11,w1
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w11,w11,w0
+	ldr	w0,[sp,#4]
+	str	w3,[sp,#0]
+	ror	w16,w24,#6
+	add	w27,w27,w19			// h+=K[i]
+	ror	w2,w13,#7
+	and	w17,w25,w24
+	ror	w1,w10,#17
+	bic	w19,w26,w24
+	ror	w3,w20,#2
+	add	w27,w27,w11			// h+=X[i]
+	eor	w16,w16,w24,ror#11
+	eor	w2,w2,w13,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w20,w21			// a^b, b^c in next round
+	eor	w16,w16,w24,ror#25	// Sigma1(e)
+	eor	w3,w3,w20,ror#13
+	add	w27,w27,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w1,w1,w10,ror#19
+	eor	w2,w2,w13,lsr#3	// sigma0(X[i+1])
+	add	w27,w27,w16			// h+=Sigma1(e)
+	eor	w28,w28,w21			// Maj(a,b,c)
+	eor	w17,w3,w20,ror#22	// Sigma0(a)
+	eor	w1,w1,w10,lsr#10	// sigma1(X[i+14])
+	add	w12,w12,w5
+	add	w23,w23,w27			// d+=h
+	add	w27,w27,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w12,w12,w2
+	add	w27,w27,w17			// h+=Sigma0(a)
+	add	w12,w12,w1
+	ldr	w1,[sp,#8]
+	str	w4,[sp,#4]
+	ror	w16,w23,#6
+	add	w26,w26,w28			// h+=K[i]
+	ror	w3,w14,#7
+	and	w17,w24,w23
+	ror	w2,w11,#17
+	bic	w28,w25,w23
+	ror	w4,w27,#2
+	add	w26,w26,w12			// h+=X[i]
+	eor	w16,w16,w23,ror#11
+	eor	w3,w3,w14,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w27,w20			// a^b, b^c in next round
+	eor	w16,w16,w23,ror#25	// Sigma1(e)
+	eor	w4,w4,w27,ror#13
+	add	w26,w26,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w2,w2,w11,ror#19
+	eor	w3,w3,w14,lsr#3	// sigma0(X[i+1])
+	add	w26,w26,w16			// h+=Sigma1(e)
+	eor	w19,w19,w20			// Maj(a,b,c)
+	eor	w17,w4,w27,ror#22	// Sigma0(a)
+	eor	w2,w2,w11,lsr#10	// sigma1(X[i+14])
+	add	w13,w13,w6
+	add	w22,w22,w26			// d+=h
+	add	w26,w26,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w13,w13,w3
+	add	w26,w26,w17			// h+=Sigma0(a)
+	add	w13,w13,w2
+	ldr	w2,[sp,#12]
+	str	w5,[sp,#8]
+	ror	w16,w22,#6
+	add	w25,w25,w19			// h+=K[i]
+	ror	w4,w15,#7
+	and	w17,w23,w22
+	ror	w3,w12,#17
+	bic	w19,w24,w22
+	ror	w5,w26,#2
+	add	w25,w25,w13			// h+=X[i]
+	eor	w16,w16,w22,ror#11
+	eor	w4,w4,w15,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w26,w27			// a^b, b^c in next round
+	eor	w16,w16,w22,ror#25	// Sigma1(e)
+	eor	w5,w5,w26,ror#13
+	add	w25,w25,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w3,w3,w12,ror#19
+	eor	w4,w4,w15,lsr#3	// sigma0(X[i+1])
+	add	w25,w25,w16			// h+=Sigma1(e)
+	eor	w28,w28,w27			// Maj(a,b,c)
+	eor	w17,w5,w26,ror#22	// Sigma0(a)
+	eor	w3,w3,w12,lsr#10	// sigma1(X[i+14])
+	add	w14,w14,w7
+	add	w21,w21,w25			// d+=h
+	add	w25,w25,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w14,w14,w4
+	add	w25,w25,w17			// h+=Sigma0(a)
+	add	w14,w14,w3
+	ldr	w3,[sp,#0]
+	str	w6,[sp,#12]
+	ror	w16,w21,#6
+	add	w24,w24,w28			// h+=K[i]
+	ror	w5,w0,#7
+	and	w17,w22,w21
+	ror	w4,w13,#17
+	bic	w28,w23,w21
+	ror	w6,w25,#2
+	add	w24,w24,w14			// h+=X[i]
+	eor	w16,w16,w21,ror#11
+	eor	w5,w5,w0,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w25,w26			// a^b, b^c in next round
+	eor	w16,w16,w21,ror#25	// Sigma1(e)
+	eor	w6,w6,w25,ror#13
+	add	w24,w24,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w4,w4,w13,ror#19
+	eor	w5,w5,w0,lsr#3	// sigma0(X[i+1])
+	add	w24,w24,w16			// h+=Sigma1(e)
+	eor	w19,w19,w26			// Maj(a,b,c)
+	eor	w17,w6,w25,ror#22	// Sigma0(a)
+	eor	w4,w4,w13,lsr#10	// sigma1(X[i+14])
+	add	w15,w15,w8
+	add	w20,w20,w24			// d+=h
+	add	w24,w24,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w15,w15,w5
+	add	w24,w24,w17			// h+=Sigma0(a)
+	add	w15,w15,w4
+	ldr	w4,[sp,#4]
+	str	w7,[sp,#0]
+	ror	w16,w20,#6
+	add	w23,w23,w19			// h+=K[i]
+	ror	w6,w1,#7
+	and	w17,w21,w20
+	ror	w5,w14,#17
+	bic	w19,w22,w20
+	ror	w7,w24,#2
+	add	w23,w23,w15			// h+=X[i]
+	eor	w16,w16,w20,ror#11
+	eor	w6,w6,w1,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w24,w25			// a^b, b^c in next round
+	eor	w16,w16,w20,ror#25	// Sigma1(e)
+	eor	w7,w7,w24,ror#13
+	add	w23,w23,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w5,w5,w14,ror#19
+	eor	w6,w6,w1,lsr#3	// sigma0(X[i+1])
+	add	w23,w23,w16			// h+=Sigma1(e)
+	eor	w28,w28,w25			// Maj(a,b,c)
+	eor	w17,w7,w24,ror#22	// Sigma0(a)
+	eor	w5,w5,w14,lsr#10	// sigma1(X[i+14])
+	add	w0,w0,w9
+	add	w27,w27,w23			// d+=h
+	add	w23,w23,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w0,w0,w6
+	add	w23,w23,w17			// h+=Sigma0(a)
+	add	w0,w0,w5
+	ldr	w5,[sp,#8]
+	str	w8,[sp,#4]
+	ror	w16,w27,#6
+	add	w22,w22,w28			// h+=K[i]
+	ror	w7,w2,#7
+	and	w17,w20,w27
+	ror	w6,w15,#17
+	bic	w28,w21,w27
+	ror	w8,w23,#2
+	add	w22,w22,w0			// h+=X[i]
+	eor	w16,w16,w27,ror#11
+	eor	w7,w7,w2,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w23,w24			// a^b, b^c in next round
+	eor	w16,w16,w27,ror#25	// Sigma1(e)
+	eor	w8,w8,w23,ror#13
+	add	w22,w22,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w6,w6,w15,ror#19
+	eor	w7,w7,w2,lsr#3	// sigma0(X[i+1])
+	add	w22,w22,w16			// h+=Sigma1(e)
+	eor	w19,w19,w24			// Maj(a,b,c)
+	eor	w17,w8,w23,ror#22	// Sigma0(a)
+	eor	w6,w6,w15,lsr#10	// sigma1(X[i+14])
+	add	w1,w1,w10
+	add	w26,w26,w22			// d+=h
+	add	w22,w22,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w1,w1,w7
+	add	w22,w22,w17			// h+=Sigma0(a)
+	add	w1,w1,w6
+	ldr	w6,[sp,#12]
+	str	w9,[sp,#8]
+	ror	w16,w26,#6
+	add	w21,w21,w19			// h+=K[i]
+	ror	w8,w3,#7
+	and	w17,w27,w26
+	ror	w7,w0,#17
+	bic	w19,w20,w26
+	ror	w9,w22,#2
+	add	w21,w21,w1			// h+=X[i]
+	eor	w16,w16,w26,ror#11
+	eor	w8,w8,w3,ror#18
+	orr	w17,w17,w19			// Ch(e,f,g)
+	eor	w19,w22,w23			// a^b, b^c in next round
+	eor	w16,w16,w26,ror#25	// Sigma1(e)
+	eor	w9,w9,w22,ror#13
+	add	w21,w21,w17			// h+=Ch(e,f,g)
+	and	w28,w28,w19			// (b^c)&=(a^b)
+	eor	w7,w7,w0,ror#19
+	eor	w8,w8,w3,lsr#3	// sigma0(X[i+1])
+	add	w21,w21,w16			// h+=Sigma1(e)
+	eor	w28,w28,w23			// Maj(a,b,c)
+	eor	w17,w9,w22,ror#22	// Sigma0(a)
+	eor	w7,w7,w0,lsr#10	// sigma1(X[i+14])
+	add	w2,w2,w11
+	add	w25,w25,w21			// d+=h
+	add	w21,w21,w28			// h+=Maj(a,b,c)
+	ldr	w28,[x30],#4		// *K++, w19 in next round
+	add	w2,w2,w8
+	add	w21,w21,w17			// h+=Sigma0(a)
+	add	w2,w2,w7
+	ldr	w7,[sp,#0]
+	str	w10,[sp,#12]
+	ror	w16,w25,#6
+	add	w20,w20,w28			// h+=K[i]
+	ror	w9,w4,#7
+	and	w17,w26,w25
+	ror	w8,w1,#17
+	bic	w28,w27,w25
+	ror	w10,w21,#2
+	add	w20,w20,w2			// h+=X[i]
+	eor	w16,w16,w25,ror#11
+	eor	w9,w9,w4,ror#18
+	orr	w17,w17,w28			// Ch(e,f,g)
+	eor	w28,w21,w22			// a^b, b^c in next round
+	eor	w16,w16,w25,ror#25	// Sigma1(e)
+	eor	w10,w10,w21,ror#13
+	add	w20,w20,w17			// h+=Ch(e,f,g)
+	and	w19,w19,w28			// (b^c)&=(a^b)
+	eor	w8,w8,w1,ror#19
+	eor	w9,w9,w4,lsr#3	// sigma0(X[i+1])
+	add	w20,w20,w16			// h+=Sigma1(e)
+	eor	w19,w19,w22			// Maj(a,b,c)
+	eor	w17,w10,w21,ror#22	// Sigma0(a)
+	eor	w8,w8,w1,lsr#10	// sigma1(X[i+14])
+	add	w3,w3,w12
+	add	w24,w24,w20			// d+=h
+	add	w20,w20,w19			// h+=Maj(a,b,c)
+	ldr	w19,[x30],#4		// *K++, w28 in next round
+	add	w3,w3,w9
+	add	w20,w20,w17			// h+=Sigma0(a)
+	add	w3,w3,w8
+	cbnz	w19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#260		// rewind
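+	// Loop_16_xx exits when the zero terminator after LK256 is loaded
+	// into w19 (cbnz falls through on K==0); 64 constants plus the
+	// terminator make 65 words, hence the 260-byte rewind.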
+
+	ldp	w3,w4,[x0]
+	ldp	w5,w6,[x0,#2*4]
+	add	x1,x1,#14*4			// advance input pointer
+	ldp	w7,w8,[x0,#4*4]
+	add	w20,w20,w3
+	ldp	w9,w10,[x0,#6*4]
+	add	w21,w21,w4
+	add	w22,w22,w5
+	add	w23,w23,w6
+	stp	w20,w21,[x0]
+	add	w24,w24,w7
+	add	w25,w25,w8
+	stp	w22,w23,[x0,#2*4]
+	add	w26,w26,w9
+	add	w27,w27,w10
+	cmp	x1,x2
+	stp	w24,w25,[x0,#4*4]
+	stp	w26,w27,[x0,#6*4]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*4
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	__TEXT,__const
+.align	6
+
+LK256:
+.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long	0	//terminator
+
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+
+.align	6
+sha256_block_armv8:
+Lv8_entry:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not
+	// popped later, so the return address is never reloaded from memory
+	// and the link register does not need to be signed.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v0.4s,v1.4s},[x0]
+	adrp	x3,LK256@PAGE
+	add	x3,x3,LK256@PAGEOFF
+
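+	// Hardware path: v0 carries the first four state words (a..d) and v1
+	// the last four (e..h); v18/v19 keep a copy of the incoming state
+	// ("offload") so it can be added back after the 64 rounds. The
+	// sha256h/sha256h2/sha256su0/sha256su1 instructions are emitted as
+	// .long encodings, with the mnemonic in the comment, for assemblers
+	// that lack the crypto extension.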
+Loop_hw:
+	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+	sub	x2,x2,#1
+	ld1	{v16.4s},[x3],#16
+	rev32	v4.16b,v4.16b
+	rev32	v5.16b,v5.16b
+	rev32	v6.16b,v6.16b
+	rev32	v7.16b,v7.16b
+	orr	v18.16b,v0.16b,v0.16b		// offload
+	orr	v19.16b,v1.16b,v1.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+.long	0x5e2828a4	//sha256su0 v4.16b,v5.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e0760c4	//sha256su1 v4.16b,v6.16b,v7.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+.long	0x5e2828c5	//sha256su0 v5.16b,v6.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0460e5	//sha256su1 v5.16b,v7.16b,v4.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v6.4s
+.long	0x5e2828e6	//sha256su0 v6.16b,v7.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+.long	0x5e056086	//sha256su1 v6.16b,v4.16b,v5.16b
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v7.4s
+.long	0x5e282887	//sha256su0 v7.16b,v4.16b
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+.long	0x5e0660a7	//sha256su1 v7.16b,v5.16b,v6.16b
+	ld1	{v17.4s},[x3],#16
+	add	v16.4s,v16.4s,v4.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	ld1	{v16.4s},[x3],#16
+	add	v17.4s,v17.4s,v5.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	ld1	{v17.4s},[x3]
+	add	v16.4s,v16.4s,v6.4s
+	sub	x3,x3,#64*4-16	// rewind
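+	// (the fifteen post-indexed 16-byte loads above advanced x3 by 240
+	// bytes through the 64-entry LK256 table; this restores it for the
+	// next block)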
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e104020	//sha256h v0.16b,v1.16b,v16.4s
+.long	0x5e105041	//sha256h2 v1.16b,v2.16b,v16.4s
+
+	add	v17.4s,v17.4s,v7.4s
+	orr	v2.16b,v0.16b,v0.16b
+.long	0x5e114020	//sha256h v0.16b,v1.16b,v17.4s
+.long	0x5e115041	//sha256h2 v1.16b,v2.16b,v17.4s
+
+	add	v0.4s,v0.4s,v18.4s
+	add	v1.4s,v1.4s,v19.4s
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.4s,v1.4s},[x0]
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/sha512-armv8.S b/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
new file mode 100644
index 0000000..b2d366d
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -0,0 +1,1614 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are an
+//	indication of some compiler "pathology"; most notably, code
+//	generated with -mgeneral-regs-only is significantly faster,
+//	and there the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
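+
+// SHA-512 below follows the same round structure as SHA-256 above, but
+// with 64-bit words, 80 rounds and different rotation amounts:
+//	Sigma0(a) = ror(a,28)^ror(a,34)^ror(a,39)
+//	Sigma1(e) = ror(e,14)^ror(e,18)^ror(e,41)
+//	sigma0(x) = ror(x,1)^ror(x,8)^(x>>7)
+//	sigma1(x) = ror(x,19)^ror(x,61)^(x>>6)
+// and an 80-entry table of 64-bit round constants at LK512.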
+
+.text
+
+
+.private_extern	_OPENSSL_armcap_P
+.globl	_sha512_block_data_order
+.private_extern	_sha512_block_data_order
+
+.align	6
+_sha512_block_data_order:
+	AARCH64_VALID_CALL_TARGET
+#ifndef	__KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+	adrp	x16,:pg_hi21_nc:_OPENSSL_armcap_P
+#else
+	adrp	x16,_OPENSSL_armcap_P@PAGE
+#endif
+	ldr	w16,[x16,_OPENSSL_armcap_P@PAGEOFF]
+	tst	w16,#ARMV8_SHA512
+	b.ne	Lv8_entry
+#endif
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-128]!
+	add	x29,sp,#0
+
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+	stp	x25,x26,[sp,#64]
+	stp	x27,x28,[sp,#80]
+	sub	sp,sp,#4*8
+
+	ldp	x20,x21,[x0]				// load context
+	ldp	x22,x23,[x0,#2*8]
+	ldp	x24,x25,[x0,#4*8]
+	add	x2,x1,x2,lsl#7	// end of input
+	ldp	x26,x27,[x0,#6*8]
+	adrp	x30,LK512@PAGE
+	add	x30,x30,LK512@PAGEOFF
+	stp	x0,x2,[x29,#96]
+
+Loop:
+	ldp	x3,x4,[x1],#2*8
+	ldr	x19,[x30],#8			// *K++
+	eor	x28,x21,x22				// magic seed
+	str	x1,[x29,#112]
+#ifndef	__AARCH64EB__
+	rev	x3,x3			// 0
+#endif
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x6,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x3			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x4,x4			// 1
+#endif
+	ldp	x5,x6,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x7,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x4			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x5,x5			// 2
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x8,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x5			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x6,x6			// 3
+#endif
+	ldp	x7,x8,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x9,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x6			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x7,x7			// 4
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x10,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x7			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x10,ror#18	// Sigma1(e)
+	ror	x10,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x10,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x8,x8			// 5
+#endif
+	ldp	x9,x10,[x1],#2*8
+	add	x23,x23,x17			// h+=Sigma0(a)
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x11,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x8			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x11,ror#18	// Sigma1(e)
+	ror	x11,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x11,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x9,x9			// 6
+#endif
+	add	x22,x22,x17			// h+=Sigma0(a)
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x12,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x9			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x12,ror#18	// Sigma1(e)
+	ror	x12,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x12,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x10,x10			// 7
+#endif
+	ldp	x11,x12,[x1],#2*8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	eor	x13,x25,x25,ror#23
+	and	x17,x26,x25
+	bic	x28,x27,x25
+	add	x20,x20,x10			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x13,ror#18	// Sigma1(e)
+	ror	x13,x21,#28
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	eor	x17,x21,x21,ror#5
+	add	x20,x20,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x24,x24,x20			// d+=h
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x13,x17,ror#34	// Sigma0(a)
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x20,x20,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x11,x11			// 8
+#endif
+	add	x20,x20,x17			// h+=Sigma0(a)
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	eor	x14,x24,x24,ror#23
+	and	x17,x25,x24
+	bic	x19,x26,x24
+	add	x27,x27,x11			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x14,ror#18	// Sigma1(e)
+	ror	x14,x20,#28
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	eor	x17,x20,x20,ror#5
+	add	x27,x27,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x23,x23,x27			// d+=h
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x14,x17,ror#34	// Sigma0(a)
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x27,x27,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x12,x12			// 9
+#endif
+	ldp	x13,x14,[x1],#2*8
+	add	x27,x27,x17			// h+=Sigma0(a)
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	eor	x15,x23,x23,ror#23
+	and	x17,x24,x23
+	bic	x28,x25,x23
+	add	x26,x26,x12			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x15,ror#18	// Sigma1(e)
+	ror	x15,x27,#28
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	eor	x17,x27,x27,ror#5
+	add	x26,x26,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x22,x22,x26			// d+=h
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x15,x17,ror#34	// Sigma0(a)
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x26,x26,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x13,x13			// 10
+#endif
+	add	x26,x26,x17			// h+=Sigma0(a)
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	eor	x0,x22,x22,ror#23
+	and	x17,x23,x22
+	bic	x19,x24,x22
+	add	x25,x25,x13			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x0,ror#18	// Sigma1(e)
+	ror	x0,x26,#28
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	eor	x17,x26,x26,ror#5
+	add	x25,x25,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x21,x21,x25			// d+=h
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x0,x17,ror#34	// Sigma0(a)
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x25,x25,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x14,x14			// 11
+#endif
+	ldp	x15,x0,[x1],#2*8
+	add	x25,x25,x17			// h+=Sigma0(a)
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	eor	x6,x21,x21,ror#23
+	and	x17,x22,x21
+	bic	x28,x23,x21
+	add	x24,x24,x14			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x6,ror#18	// Sigma1(e)
+	ror	x6,x25,#28
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	eor	x17,x25,x25,ror#5
+	add	x24,x24,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x20,x20,x24			// d+=h
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x17,ror#34	// Sigma0(a)
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x24,x24,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x15,x15			// 12
+#endif
+	add	x24,x24,x17			// h+=Sigma0(a)
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	eor	x7,x20,x20,ror#23
+	and	x17,x21,x20
+	bic	x19,x22,x20
+	add	x23,x23,x15			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x7,ror#18	// Sigma1(e)
+	ror	x7,x24,#28
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	eor	x17,x24,x24,ror#5
+	add	x23,x23,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x27,x27,x23			// d+=h
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x17,ror#34	// Sigma0(a)
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x23,x23,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x0,x0			// 13
+#endif
+	ldp	x1,x2,[x1]
+	add	x23,x23,x17			// h+=Sigma0(a)
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	eor	x8,x27,x27,ror#23
+	and	x17,x20,x27
+	bic	x28,x21,x27
+	add	x22,x22,x0			// h+=X[i]
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x8,ror#18	// Sigma1(e)
+	ror	x8,x23,#28
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	eor	x17,x23,x23,ror#5
+	add	x22,x22,x16			// h+=Sigma1(e)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	add	x26,x26,x22			// d+=h
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x17,ror#34	// Sigma0(a)
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	//add	x22,x22,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x1,x1			// 14
+#endif
+	ldr	x6,[sp,#24]
+	add	x22,x22,x17			// h+=Sigma0(a)
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	eor	x9,x26,x26,ror#23
+	and	x17,x27,x26
+	bic	x19,x20,x26
+	add	x21,x21,x1			// h+=X[i]
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x9,ror#18	// Sigma1(e)
+	ror	x9,x22,#28
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	eor	x17,x22,x22,ror#5
+	add	x21,x21,x16			// h+=Sigma1(e)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	add	x25,x25,x21			// d+=h
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x17,ror#34	// Sigma0(a)
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	//add	x21,x21,x17			// h+=Sigma0(a)
+#ifndef	__AARCH64EB__
+	rev	x2,x2			// 15
+#endif
+	ldr	x7,[sp,#0]
+	add	x21,x21,x17			// h+=Sigma0(a)
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
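+	// As in the SHA-256 code, rounds 16-79 extend the schedule in place:
+	//	W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
+	// using the 64-bit sigma0/sigma1 above, with spills to the 32-byte
+	// scratch area reserved by "sub sp,sp,#4*8".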
+Loop_16_xx:
+	ldr	x8,[sp,#8]
+	str	x11,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x10,x5,#1
+	and	x17,x25,x24
+	ror	x9,x2,#19
+	bic	x19,x26,x24
+	ror	x11,x20,#28
+	add	x27,x27,x3			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x10,x10,x5,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x11,x11,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x9,x9,x2,ror#61
+	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x11,x20,ror#39	// Sigma0(a)
+	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
+	add	x4,x4,x13
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x4,x4,x10
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x4,x4,x9
+	ldr	x9,[sp,#16]
+	str	x12,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x11,x6,#1
+	and	x17,x24,x23
+	ror	x10,x3,#19
+	bic	x28,x25,x23
+	ror	x12,x27,#28
+	add	x26,x26,x4			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x11,x11,x6,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x12,x12,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x10,x10,x3,ror#61
+	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x12,x27,ror#39	// Sigma0(a)
+	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
+	add	x5,x5,x14
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x5,x5,x11
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x5,x5,x10
+	ldr	x10,[sp,#24]
+	str	x13,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x12,x7,#1
+	and	x17,x23,x22
+	ror	x11,x4,#19
+	bic	x19,x24,x22
+	ror	x13,x26,#28
+	add	x25,x25,x5			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x12,x12,x7,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x13,x13,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x11,x11,x4,ror#61
+	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x13,x26,ror#39	// Sigma0(a)
+	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
+	add	x6,x6,x15
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x6,x6,x12
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x6,x6,x11
+	ldr	x11,[sp,#0]
+	str	x14,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x13,x8,#1
+	and	x17,x22,x21
+	ror	x12,x5,#19
+	bic	x28,x23,x21
+	ror	x14,x25,#28
+	add	x24,x24,x6			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x13,x13,x8,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x14,x14,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x12,x12,x5,ror#61
+	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x14,x25,ror#39	// Sigma0(a)
+	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
+	add	x7,x7,x0
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x7,x7,x13
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x7,x7,x12
+	ldr	x12,[sp,#8]
+	str	x15,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x14,x9,#1
+	and	x17,x21,x20
+	ror	x13,x6,#19
+	bic	x19,x22,x20
+	ror	x15,x24,#28
+	add	x23,x23,x7			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x14,x14,x9,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x15,x15,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x13,x13,x6,ror#61
+	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x15,x24,ror#39	// Sigma0(a)
+	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
+	add	x8,x8,x1
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x8,x8,x14
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x8,x8,x13
+	ldr	x13,[sp,#16]
+	str	x0,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x15,x10,#1
+	and	x17,x20,x27
+	ror	x14,x7,#19
+	bic	x28,x21,x27
+	ror	x0,x23,#28
+	add	x22,x22,x8			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x15,x15,x10,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x0,x0,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x14,x14,x7,ror#61
+	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x0,x23,ror#39	// Sigma0(a)
+	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
+	add	x9,x9,x2
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x9,x9,x15
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x9,x9,x14
+	ldr	x14,[sp,#24]
+	str	x1,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x0,x11,#1
+	and	x17,x27,x26
+	ror	x15,x8,#19
+	bic	x19,x20,x26
+	ror	x1,x22,#28
+	add	x21,x21,x9			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x0,x0,x11,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x1,x1,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x15,x15,x8,ror#61
+	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x1,x22,ror#39	// Sigma0(a)
+	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
+	add	x10,x10,x3
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x10,x10,x0
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x10,x10,x15
+	ldr	x15,[sp,#0]
+	str	x2,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x1,x12,#1
+	and	x17,x26,x25
+	ror	x0,x9,#19
+	bic	x28,x27,x25
+	ror	x2,x21,#28
+	add	x20,x20,x10			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x1,x1,x12,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x2,x2,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x0,x0,x9,ror#61
+	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x2,x21,ror#39	// Sigma0(a)
+	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
+	add	x11,x11,x4
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x11,x11,x1
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x11,x11,x0
+	ldr	x0,[sp,#8]
+	str	x3,[sp,#0]
+	ror	x16,x24,#14
+	add	x27,x27,x19			// h+=K[i]
+	ror	x2,x13,#1
+	and	x17,x25,x24
+	ror	x1,x10,#19
+	bic	x19,x26,x24
+	ror	x3,x20,#28
+	add	x27,x27,x11			// h+=X[i]
+	eor	x16,x16,x24,ror#18
+	eor	x2,x2,x13,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x20,x21			// a^b, b^c in next round
+	eor	x16,x16,x24,ror#41	// Sigma1(e)
+	eor	x3,x3,x20,ror#34
+	add	x27,x27,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x1,x1,x10,ror#61
+	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
+	add	x27,x27,x16			// h+=Sigma1(e)
+	eor	x28,x28,x21			// Maj(a,b,c)
+	eor	x17,x3,x20,ror#39	// Sigma0(a)
+	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
+	add	x12,x12,x5
+	add	x23,x23,x27			// d+=h
+	add	x27,x27,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x12,x12,x2
+	add	x27,x27,x17			// h+=Sigma0(a)
+	add	x12,x12,x1
+	ldr	x1,[sp,#16]
+	str	x4,[sp,#8]
+	ror	x16,x23,#14
+	add	x26,x26,x28			// h+=K[i]
+	ror	x3,x14,#1
+	and	x17,x24,x23
+	ror	x2,x11,#19
+	bic	x28,x25,x23
+	ror	x4,x27,#28
+	add	x26,x26,x12			// h+=X[i]
+	eor	x16,x16,x23,ror#18
+	eor	x3,x3,x14,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x27,x20			// a^b, b^c in next round
+	eor	x16,x16,x23,ror#41	// Sigma1(e)
+	eor	x4,x4,x27,ror#34
+	add	x26,x26,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x2,x2,x11,ror#61
+	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
+	add	x26,x26,x16			// h+=Sigma1(e)
+	eor	x19,x19,x20			// Maj(a,b,c)
+	eor	x17,x4,x27,ror#39	// Sigma0(a)
+	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
+	add	x13,x13,x6
+	add	x22,x22,x26			// d+=h
+	add	x26,x26,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x13,x13,x3
+	add	x26,x26,x17			// h+=Sigma0(a)
+	add	x13,x13,x2
+	ldr	x2,[sp,#24]
+	str	x5,[sp,#16]
+	ror	x16,x22,#14
+	add	x25,x25,x19			// h+=K[i]
+	ror	x4,x15,#1
+	and	x17,x23,x22
+	ror	x3,x12,#19
+	bic	x19,x24,x22
+	ror	x5,x26,#28
+	add	x25,x25,x13			// h+=X[i]
+	eor	x16,x16,x22,ror#18
+	eor	x4,x4,x15,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x26,x27			// a^b, b^c in next round
+	eor	x16,x16,x22,ror#41	// Sigma1(e)
+	eor	x5,x5,x26,ror#34
+	add	x25,x25,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x3,x3,x12,ror#61
+	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
+	add	x25,x25,x16			// h+=Sigma1(e)
+	eor	x28,x28,x27			// Maj(a,b,c)
+	eor	x17,x5,x26,ror#39	// Sigma0(a)
+	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
+	add	x14,x14,x7
+	add	x21,x21,x25			// d+=h
+	add	x25,x25,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x14,x14,x4
+	add	x25,x25,x17			// h+=Sigma0(a)
+	add	x14,x14,x3
+	ldr	x3,[sp,#0]
+	str	x6,[sp,#24]
+	ror	x16,x21,#14
+	add	x24,x24,x28			// h+=K[i]
+	ror	x5,x0,#1
+	and	x17,x22,x21
+	ror	x4,x13,#19
+	bic	x28,x23,x21
+	ror	x6,x25,#28
+	add	x24,x24,x14			// h+=X[i]
+	eor	x16,x16,x21,ror#18
+	eor	x5,x5,x0,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x25,x26			// a^b, b^c in next round
+	eor	x16,x16,x21,ror#41	// Sigma1(e)
+	eor	x6,x6,x25,ror#34
+	add	x24,x24,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x4,x4,x13,ror#61
+	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
+	add	x24,x24,x16			// h+=Sigma1(e)
+	eor	x19,x19,x26			// Maj(a,b,c)
+	eor	x17,x6,x25,ror#39	// Sigma0(a)
+	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
+	add	x15,x15,x8
+	add	x20,x20,x24			// d+=h
+	add	x24,x24,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x15,x15,x5
+	add	x24,x24,x17			// h+=Sigma0(a)
+	add	x15,x15,x4
+	ldr	x4,[sp,#8]
+	str	x7,[sp,#0]
+	ror	x16,x20,#14
+	add	x23,x23,x19			// h+=K[i]
+	ror	x6,x1,#1
+	and	x17,x21,x20
+	ror	x5,x14,#19
+	bic	x19,x22,x20
+	ror	x7,x24,#28
+	add	x23,x23,x15			// h+=X[i]
+	eor	x16,x16,x20,ror#18
+	eor	x6,x6,x1,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x24,x25			// a^b, b^c in next round
+	eor	x16,x16,x20,ror#41	// Sigma1(e)
+	eor	x7,x7,x24,ror#34
+	add	x23,x23,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x5,x5,x14,ror#61
+	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
+	add	x23,x23,x16			// h+=Sigma1(e)
+	eor	x28,x28,x25			// Maj(a,b,c)
+	eor	x17,x7,x24,ror#39	// Sigma0(a)
+	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
+	add	x0,x0,x9
+	add	x27,x27,x23			// d+=h
+	add	x23,x23,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x0,x0,x6
+	add	x23,x23,x17			// h+=Sigma0(a)
+	add	x0,x0,x5
+	ldr	x5,[sp,#16]
+	str	x8,[sp,#8]
+	ror	x16,x27,#14
+	add	x22,x22,x28			// h+=K[i]
+	ror	x7,x2,#1
+	and	x17,x20,x27
+	ror	x6,x15,#19
+	bic	x28,x21,x27
+	ror	x8,x23,#28
+	add	x22,x22,x0			// h+=X[i]
+	eor	x16,x16,x27,ror#18
+	eor	x7,x7,x2,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x23,x24			// a^b, b^c in next round
+	eor	x16,x16,x27,ror#41	// Sigma1(e)
+	eor	x8,x8,x23,ror#34
+	add	x22,x22,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x6,x6,x15,ror#61
+	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
+	add	x22,x22,x16			// h+=Sigma1(e)
+	eor	x19,x19,x24			// Maj(a,b,c)
+	eor	x17,x8,x23,ror#39	// Sigma0(a)
+	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
+	add	x1,x1,x10
+	add	x26,x26,x22			// d+=h
+	add	x22,x22,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x1,x1,x7
+	add	x22,x22,x17			// h+=Sigma0(a)
+	add	x1,x1,x6
+	ldr	x6,[sp,#24]
+	str	x9,[sp,#16]
+	ror	x16,x26,#14
+	add	x21,x21,x19			// h+=K[i]
+	ror	x8,x3,#1
+	and	x17,x27,x26
+	ror	x7,x0,#19
+	bic	x19,x20,x26
+	ror	x9,x22,#28
+	add	x21,x21,x1			// h+=X[i]
+	eor	x16,x16,x26,ror#18
+	eor	x8,x8,x3,ror#8
+	orr	x17,x17,x19			// Ch(e,f,g)
+	eor	x19,x22,x23			// a^b, b^c in next round
+	eor	x16,x16,x26,ror#41	// Sigma1(e)
+	eor	x9,x9,x22,ror#34
+	add	x21,x21,x17			// h+=Ch(e,f,g)
+	and	x28,x28,x19			// (b^c)&=(a^b)
+	eor	x7,x7,x0,ror#61
+	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
+	add	x21,x21,x16			// h+=Sigma1(e)
+	eor	x28,x28,x23			// Maj(a,b,c)
+	eor	x17,x9,x22,ror#39	// Sigma0(a)
+	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
+	add	x2,x2,x11
+	add	x25,x25,x21			// d+=h
+	add	x21,x21,x28			// h+=Maj(a,b,c)
+	ldr	x28,[x30],#8		// *K++, x19 in next round
+	add	x2,x2,x8
+	add	x21,x21,x17			// h+=Sigma0(a)
+	add	x2,x2,x7
+	ldr	x7,[sp,#0]
+	str	x10,[sp,#24]
+	ror	x16,x25,#14
+	add	x20,x20,x28			// h+=K[i]
+	ror	x9,x4,#1
+	and	x17,x26,x25
+	ror	x8,x1,#19
+	bic	x28,x27,x25
+	ror	x10,x21,#28
+	add	x20,x20,x2			// h+=X[i]
+	eor	x16,x16,x25,ror#18
+	eor	x9,x9,x4,ror#8
+	orr	x17,x17,x28			// Ch(e,f,g)
+	eor	x28,x21,x22			// a^b, b^c in next round
+	eor	x16,x16,x25,ror#41	// Sigma1(e)
+	eor	x10,x10,x21,ror#34
+	add	x20,x20,x17			// h+=Ch(e,f,g)
+	and	x19,x19,x28			// (b^c)&=(a^b)
+	eor	x8,x8,x1,ror#61
+	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
+	add	x20,x20,x16			// h+=Sigma1(e)
+	eor	x19,x19,x22			// Maj(a,b,c)
+	eor	x17,x10,x21,ror#39	// Sigma0(a)
+	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
+	add	x3,x3,x12
+	add	x24,x24,x20			// d+=h
+	add	x20,x20,x19			// h+=Maj(a,b,c)
+	ldr	x19,[x30],#8		// *K++, x28 in next round
+	add	x3,x3,x9
+	add	x20,x20,x17			// h+=Sigma0(a)
+	add	x3,x3,x8
+	cbnz	x19,Loop_16_xx
+
+	ldp	x0,x2,[x29,#96]
+	ldr	x1,[x29,#112]
+	sub	x30,x30,#648		// rewind
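+						// 648 = (80 K512 quads + terminator) * 8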
+
+	ldp	x3,x4,[x0]
+	ldp	x5,x6,[x0,#2*8]
+	add	x1,x1,#14*8			// advance input pointer
+	ldp	x7,x8,[x0,#4*8]
+	add	x20,x20,x3
+	ldp	x9,x10,[x0,#6*8]
+	add	x21,x21,x4
+	add	x22,x22,x5
+	add	x23,x23,x6
+	stp	x20,x21,[x0]
+	add	x24,x24,x7
+	add	x25,x25,x8
+	stp	x22,x23,[x0,#2*8]
+	add	x26,x26,x9
+	add	x27,x27,x10
+	cmp	x1,x2
+	stp	x24,x25,[x0,#4*8]
+	stp	x26,x27,[x0,#6*8]
+	b.ne	Loop
+
+	ldp	x19,x20,[x29,#16]
+	add	sp,sp,#4*8
+	ldp	x21,x22,[x29,#32]
+	ldp	x23,x24,[x29,#48]
+	ldp	x25,x26,[x29,#64]
+	ldp	x27,x28,[x29,#80]
+	ldp	x29,x30,[sp],#128
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.section	__TEXT,__const
+.align	6
+
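+// The 80 SHA-512 round constants (the first 64 bits of the fractional parts
+// of the cube roots of the first 80 primes), plus a zero terminator that the
+// scalar Loop_16_xx above uses as its exit condition.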
+LK512:
+.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad	0x3956c25bf348b538,0x59f111f1b605d019
+.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad	0xd807aa98a3030242,0x12835b0145706fbe
+.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad	0x06ca6351e003826f,0x142929670a0e6e70
+.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad	0x81c2c92e47edaee6,0x92722c851482353b
+.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad	0xd192e819d6ef5218,0xd69906245565a910
+.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad	0x90befffa23631e28,0xa4506cebde82bde9
+.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad	0xca273eceea26619c,0xd186b8c721c0c207
+.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad	0x113f9804bef90dae,0x1b710b35131c471b
+.quad	0x28db77f523047d84,0x32caab7b40c72493
+.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad	0	// terminator
+
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+.text
+#ifndef	__KERNEL__
+
+.align	6
+sha512_block_armv8:
+Lv8_entry:
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
+	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
+	adrp	x3,LK512@PAGE
+	add	x3,x3,LK512@PAGEOFF
+
+	rev64	v16.16b,v16.16b
+	rev64	v17.16b,v17.16b
+	rev64	v18.16b,v18.16b
+	rev64	v19.16b,v19.16b
+	rev64	v20.16b,v20.16b
+	rev64	v21.16b,v21.16b
+	rev64	v22.16b,v22.16b
+	rev64	v23.16b,v23.16b
+	b	Loop_hw
+
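+// The .long words in the loop below are the fixed encodings of the Armv8
+// SHA-512 instructions (sha512h, sha512h2, sha512su0, sha512su1), emitted
+// numerically so the code assembles even with toolchains that do not
+// recognize the SHA-512 extension; each carries its mnemonic as a comment.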
+.align	4
+Loop_hw:
+	ld1	{v24.2d},[x3],#16
+	subs	x2,x2,#1
+	sub	x4,x1,#128
+	orr	v26.16b,v0.16b,v0.16b			// offload
+	orr	v27.16b,v1.16b,v1.16b
+	orr	v28.16b,v2.16b,v2.16b
+	orr	v29.16b,v3.16b,v3.16b
+	csel	x1,x1,x4,ne			// conditional rewind
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08230	//sha512su0 v16.16b,v17.16b
+	ext	v7.16b,v20.16b,v21.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08251	//sha512su0 v17.16b,v18.16b
+	ext	v7.16b,v21.16b,v22.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec08272	//sha512su0 v18.16b,v19.16b
+	ext	v7.16b,v22.16b,v23.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08293	//sha512su0 v19.16b,v20.16b
+	ext	v7.16b,v23.16b,v16.16b,#8
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+.long	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082b4	//sha512su0 v20.16b,v21.16b
+	ext	v7.16b,v16.16b,v17.16b,#8
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+.long	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec082d5	//sha512su0 v21.16b,v22.16b
+	ext	v7.16b,v17.16b,v18.16b,#8
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+.long	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v25.2d},[x3],#16
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xcec082f6	//sha512su0 v22.16b,v23.16b
+	ext	v7.16b,v18.16b,v19.16b,#8
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+.long	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v24.2d},[x3],#16
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xcec08217	//sha512su0 v23.16b,v16.16b
+	ext	v7.16b,v19.16b,v20.16b,#8
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+.long	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v16.2d
+	ld1	{v16.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v16.16b,v16.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v17.2d
+	ld1	{v17.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v17.16b,v17.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v18.2d
+	ld1	{v18.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v18.16b,v18.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v19.2d
+	ld1	{v19.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v2.16b,v3.16b,#8
+	ext	v6.16b,v1.16b,v2.16b,#8
+	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
+	rev64	v19.16b,v19.16b
+	add	v4.2d,v1.2d,v3.2d		// "D + T1"
+.long	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v20.2d
+	ld1	{v20.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v4.16b,v2.16b,#8
+	ext	v6.16b,v0.16b,v4.16b,#8
+	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
+	rev64	v20.16b,v20.16b
+	add	v1.2d,v0.2d,v2.2d		// "D + T1"
+.long	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
+	ld1	{v24.2d},[x3],#16
+	add	v25.2d,v25.2d,v21.2d
+	ld1	{v21.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v1.16b,v4.16b,#8
+	ext	v6.16b,v3.16b,v1.16b,#8
+	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
+	rev64	v21.16b,v21.16b
+	add	v0.2d,v3.2d,v4.2d		// "D + T1"
+.long	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
+	ld1	{v25.2d},[x3],#16
+	add	v24.2d,v24.2d,v22.2d
+	ld1	{v22.16b},[x1],#16		// load next input
+	ext	v24.16b,v24.16b,v24.16b,#8
+	ext	v5.16b,v0.16b,v1.16b,#8
+	ext	v6.16b,v2.16b,v0.16b,#8
+	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
+.long	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
+	rev64	v22.16b,v22.16b
+	add	v3.2d,v2.2d,v1.2d		// "D + T1"
+.long	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
+	sub	x3,x3,#80*8	// rewind
+	add	v25.2d,v25.2d,v23.2d
+	ld1	{v23.16b},[x1],#16		// load next input
+	ext	v25.16b,v25.16b,v25.16b,#8
+	ext	v5.16b,v3.16b,v0.16b,#8
+	ext	v6.16b,v4.16b,v3.16b,#8
+	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
+.long	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
+	rev64	v23.16b,v23.16b
+	add	v2.2d,v4.2d,v0.2d		// "D + T1"
+.long	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
+	add	v0.2d,v0.2d,v26.2d			// accumulate
+	add	v1.2d,v1.2d,v27.2d
+	add	v2.2d,v2.2d,v28.2d
+	add	v3.2d,v3.2d,v29.2d
+
+	cbnz	x2,Loop_hw
+
+	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
+
+	ldr	x29,[sp],#16
+	ret
+
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S b/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
new file mode 100644
index 0000000..6dfc25d
--- /dev/null
+++ b/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S
@@ -0,0 +1,1232 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.section	__TEXT,__const
+
+
+.align	7	// totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward:	//	mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward:	//	mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr:	//	sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+Lk_inv:	//	inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt:	//	input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo:	//	sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1:	//	sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2:	//	sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+//  Decryption stuff
+//
+Lk_dipt:	//	decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo:	//	decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9:	//	decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd:	//	decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb:	//	decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe:	//	decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+//  Key schedule constants
+//
+Lk_dksd:	//	decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb:	//	decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9:	//	decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon:	//	rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt:	//	output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+
+.align	6
+
+.text
+##
+##  _vpaes_encrypt_preheat
+##
+##  Fills register %r10 -> _vpaes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+
+.align	4
+_vpaes_encrypt_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v17.16b, #0x0f
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
+	ret
+
+
+##
+##  _vpaes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
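+##  In this AArch64 translation the input block is in v7, the scheduled
+##  keys are at x2 (round count at [x2,#240]), and the result is left in
+##  v0; the inline comments carry the corresponding original x86 code.
+##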
+
+.align	4
+_vpaes_encrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward@PAGE+16
+	add	x11, x11, Lk_mc_forward@PAGEOFF+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Lenc_entry
+
+.align	4
+Lenc_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	sub	w8, w8, #1			// nr--
+
+Lenc_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	ret
+
+
+.globl	_vpaes_encrypt
+.private_extern	_vpaes_encrypt
+
+.align	4
+_vpaes_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_encrypt_preheat
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
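+// v14-v15 input, v0-v1 output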
+
+.align	4
+_vpaes_encrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+	adrp	x11, Lk_mc_forward@PAGE+16
+	add	x11, x11, Lk_mc_forward@PAGEOFF+16
+						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
+	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b,  v17.16b
+	ushr	v8.16b,  v15.16b,  #4
+	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
+	tbl	v9.16b,  {v20.16b}, v9.16b
+						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
+	tbl	v10.16b, {v21.16b}, v8.16b
+	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
+	eor	v8.16b,  v9.16b,   v16.16b
+	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,   v10.16b
+	b	Lenc_2x_entry
+
+.align	4
+Lenc_2x_loop:
+	// middle of middle round
+	add	x10, x11, #0x40
+	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	tbl	v12.16b, {v25.16b}, v10.16b
+	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v24.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	tbl	v13.16b, {v27.16b}, v10.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	tbl	v10.16b, {v26.16b}, v11.16b
+	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	tbl	v11.16b, {v8.16b}, v1.16b
+	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	eor	v10.16b, v10.16b, v13.16b
+	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	tbl	v8.16b,  {v8.16b}, v4.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	eor	v11.16b, v11.16b, v10.16b
+	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	tbl	v12.16b, {v11.16b},v1.16b
+	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	eor	v8.16b,  v8.16b,  v11.16b
+	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	eor	v8.16b,  v8.16b,  v12.16b
+	sub	w8, w8, #1			// nr--
+
+Lenc_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
+	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b, v17.16b
+	ushr	v8.16b,  v8.16b, #4
+	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	tbl	v13.16b, {v19.16b},v9.16b
+	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,  v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v13.16b
+	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v13.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
+	cbnz	w8, Lenc_2x_loop
+
+	// middle of last round
+	add	x10, x11, #0x80
+						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	tbl	v8.16b,  {v23.16b}, v11.16b
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	eor	v8.16b,  v8.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v1.16b
+	ret
+
+
+
+.align	4
+_vpaes_decrypt_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v17.16b, #0x0f
+	adrp	x11, Lk_dipt@PAGE
+	add	x11, x11, Lk_dipt@PAGEOFF
+	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
+	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
+	ret
+
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+
+.align	4
+_vpaes_decrypt_core:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr@PAGE
+	add	x10, x10, Lk_sr@PAGEOFF
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward@PAGE+48
+	add	x10, x10, Lk_mc_forward@PAGEOFF+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	b	Ldec_entry
+
+.align	4
+Ldec_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_entry:
+	// top of round
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+.globl	_vpaes_decrypt
+.private_extern	_vpaes_decrypt
+
+.align	4
+_vpaes_decrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	ld1	{v7.16b}, [x0]
+	bl	_vpaes_decrypt_preheat
+	bl	_vpaes_decrypt_core
+	st1	{v0.16b}, [x1]
+
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+// v14-v15 input, v0-v1 output
+
+.align	4
+_vpaes_decrypt_2x:
+	mov	x9, x2
+	ldr	w8, [x2,#240]			// pull rounds
+
+						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
+	eor	x11, x11, #0x30			// xor		$0x30,	%r11
+	adrp	x10, Lk_sr@PAGE
+	add	x10, x10, Lk_sr@PAGEOFF
+	and	x11, x11, #0x30			// and		$0x30,	%r11
+	add	x11, x11, x10
+	adrp	x10, Lk_mc_forward@PAGE+48
+	add	x10, x10, Lk_mc_forward@PAGEOFF+48
+
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
+	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+	and	v9.16b,  v15.16b, v17.16b
+	ushr	v8.16b,  v15.16b, #4
+	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+	tbl	v10.16b, {v20.16b},v9.16b
+	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	tbl	v8.16b,  {v21.16b},v8.16b
+	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
+	eor	v10.16b, v10.16b, v16.16b
+	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
+	eor	v8.16b,  v8.16b,  v10.16b
+	b	Ldec_2x_entry
+
+.align	4
+Ldec_2x_loop:
+//
+//  Inverse mix columns
+//
+						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	tbl	v12.16b, {v24.16b}, v10.16b
+	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	tbl	v9.16b,  {v25.16b}, v11.16b
+	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
+	eor	v8.16b,  v12.16b, v16.16b
+						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+
+	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	tbl	v12.16b, {v26.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	tbl	v9.16b,  {v27.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	tbl	v12.16b, {v28.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	tbl	v9.16b,  {v29.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
+
+	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	tbl	v12.16b, {v30.16b}, v10.16b
+	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	tbl	v8.16b,  {v8.16b},v5.16b
+	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	tbl	v9.16b,  {v31.16b}, v11.16b
+	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	eor	v8.16b,  v8.16b,  v12.16b
+	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
+	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	eor	v8.16b,  v8.16b,  v9.16b
+	sub	w8, w8, #1			// sub		$1,%rax			# nr--
+
+Ldec_2x_entry:
+	// top of round
+	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
+	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	and	v9.16b,  v8.16b,  v17.16b
+	ushr	v8.16b,  v8.16b,  #4
+	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	tbl	v10.16b, {v19.16b},v9.16b
+	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
+	eor	v9.16b,	 v9.16b,  v8.16b
+	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	tbl	v11.16b, {v18.16b},v8.16b
+	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	tbl	v12.16b, {v18.16b},v9.16b
+	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	eor	v11.16b, v11.16b, v10.16b
+	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	eor	v12.16b, v12.16b, v10.16b
+	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	tbl	v10.16b, {v18.16b},v11.16b
+	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	tbl	v11.16b, {v18.16b},v12.16b
+	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
+	eor	v10.16b, v10.16b, v9.16b
+	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	eor	v11.16b, v11.16b, v8.16b
+	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
+	cbnz	w8, Ldec_2x_loop
+
+	// middle of last round
+						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	tbl	v12.16b, {v22.16b}, v10.16b
+						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	tbl	v9.16b,  {v23.16b}, v11.16b
+	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	eor	v12.16b, v12.16b, v16.16b
+	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
+	eor	v8.16b,  v9.16b,  v12.16b
+	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
+	tbl	v1.16b,  {v8.16b},v2.16b
+	ret
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+
+.align	4
+_vpaes_key_preheat:
+	adrp	x10, Lk_inv@PAGE
+	add	x10, x10, Lk_inv@PAGEOFF
+	movi	v16.16b, #0x5b			// Lk_s63
+	adrp	x11, Lk_sb1@PAGE
+	add	x11, x11, Lk_sb1@PAGEOFF
+	movi	v17.16b, #0x0f			// Lk_s0F
+	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
+	adrp	x10, Lk_dksd@PAGE
+	add	x10, x10, Lk_dksd@PAGEOFF
+	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
+	adrp	x11, Lk_mc_forward@PAGE
+	add	x11, x11, Lk_mc_forward@PAGEOFF
+	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
+	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
+	ld1	{v8.2d}, [x10]			// Lk_rcon
+	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
+	ret
+
+
+
+.align	4
+_vpaes_schedule_core:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29, x30, [sp,#-16]!
+	add	x29,sp,#0
+
+	bl	_vpaes_key_preheat		// load the tables
+
+	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	// input transform
+	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
+
+	adrp	x10, Lk_sr@PAGE		// lea	Lk_sr(%rip),%r10
+	add	x10, x10, Lk_sr@PAGEOFF
+
+	add	x8, x8, x10
+	cbnz	w3, Lschedule_am_decrypting
+
+	// encrypting, output zeroth round key after transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
+	b	Lschedule_go
+
+Lschedule_am_decrypting:
+	// decrypting, output zeroth round key after shiftrows
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	eor	x8, x8, #0x30			// xor	$0x30, %r8
+
+Lschedule_go:
+	cmp	w1, #192			// cmp	$192,	%esi
+	b.hi	Lschedule_256
+	b.eq	Lschedule_192
+	// 128: fall through
+
+##
+##  .aes_schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+Lschedule_128:
+	mov	x0, #10			// mov	$10, %esi
+
+Loop_schedule_128:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// write output
+	b	Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
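+##  x0 stands in for %esi: four trips through Loop_schedule_192 (the last
+##  cut short at Lschedule_mangle_last) supply, together with the round-0
+##  key, the 13 round keys that AES-192's 12 rounds require.
+##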
+.align	4
+Lschedule_192:
+	sub	x0, x0, #8
+	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	x0, #4			// mov	$4,	%esi
+
+Loop_schedule_192:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_round
+	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		// save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		// save key n+1
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		// save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
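+##  (Illustrative count: AES-256 needs 15 round keys. The loop below
+##  runs 7 times and each pass emits two keys, one per side, so
+##  7 * 2 = 14, plus the initial key stored on entry = 15.)
+##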
+.align	4
+Lschedule_256:
+	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	// input transform
+	mov	x0, #7			// mov	$7, %esi
+
+Loop_schedule_256:
+	sub	x0, x0, #1			// dec	%esi
+	bl	_vpaes_schedule_mangle		// output low result
+	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	// high round
+	bl	_vpaes_schedule_round
+	cbz	x0, Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	// low round. swap xmm7 and xmm6
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	movi	v4.16b, #0
+	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
+	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
+
+	b	Loop_schedule_256
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	4
+Lschedule_mangle_last:
+	// schedule last round key from xmm0
+	adrp	x11, Lk_deskew@PAGE	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
+	add	x11, x11, Lk_deskew@PAGEOFF
+
+	cbnz	w3, Lschedule_mangle_last_dec
+
+	// encrypting
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
+	adrp	x11, Lk_opt@PAGE		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
+	add	x11, x11, Lk_opt@PAGEOFF
+	add	x2, x2, #32			// add	$32,	%rdx
+	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+
+Lschedule_mangle_last_dec:
+	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
+	sub	x2, x2, #16			// add	$-16,	%rdx
+	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	// output transform
+	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	// cleanup
+	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
+	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
+	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
+	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
+	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
+	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
+	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
+	ldp	x29, x30, [sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
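+##  (Word-level sketch, illustrative, using the xmm names above:
+##      t     = (c, 0, 0, 0)           # shuffle of xmm6
+##      xmm6 ^= t                      # -> c+d  c  0  0
+##      xmm6 ^= (b, b, b, a)           # -> b+c+d  b+c  b  a
+##      xmm0  = xmm6, then zero the low half of xmm6 again.)
+##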
+
+.align	4
+_vpaes_schedule_192_smear:
+	movi	v1.16b, #0
+	dup	v0.4s, v7.s[3]
+	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
+	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
+	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	ret
+
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
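+##  (Compact pseudocode for the description above, illustrative only:
+##      t    = rotl_bytes(subbytes(hi_dword(xmm0)), 1)
+##      xmm7 = smear(xmm7 ^ rcon) ^ t
+##      xmm0 = xmm7
+##  where smear() xors each dword into the next higher one.)
+##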
+
+.align	4
+_vpaes_schedule_round:
+	// extract rcon from xmm8
+	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
+	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+
+	// rotate
+	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
+	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	// fall through...
+
+	// low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	// smear xmm7
+	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
+	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
+	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
+
+	// subbytes
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
+	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
+	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
+	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
+	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	// add in smeared stuff
+	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
+	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
+	ret
+
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
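+##  (Per byte this computes, in illustrative C,
+##      out[i] = lo_tbl[in[i] & 0x0F] ^ hi_tbl[in[i] >> 4];
+##  a nibble-split lookup through the two 16-byte tables at %r11.)
+##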
+
+.align	4
+_vpaes_schedule_transform:
+	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
+	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
+						// vmovdqa	(%r11),	%xmm2 	# lo
+	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
+						// vmovdqa	16(%r11),	%xmm1 # hi
+	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
+	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
+	ret
+
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
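+##  (Encrypt-side sketch, illustrative, writing rot() for the
+##  Lk_mc_forward byte rotation:
+##      t   = xmm0 ^ 0x63..63
+##      out = shiftrows(rot(t) ^ rot(rot(t)) ^ rot(rot(rot(t))))
+##  which realizes the "multiply by circulant 0,1,1,1" step.)
+##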
+
+.align	4
+_vpaes_schedule_mangle:
+	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
+	cbnz	w3, Lschedule_mangle_dec
+
+	// encrypting
+	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
+	add	x2, x2, #16			// add	$16,	%rdx
+	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
+	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
+	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
+	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
+
+	b	Lschedule_mangle_both
+.align	4
+Lschedule_mangle_dec:
+	// inverse mix columns
+						// lea	.Lk_dksd(%rip),%r11
+	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+						// vmovdqa	0x00(%r11),	%xmm2
+	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+						// vmovdqa	0x10(%r11),	%xmm3
+	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x20(%r11),	%xmm2
+	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x30(%r11),	%xmm3
+	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+
+						// vmovdqa	0x40(%r11),	%xmm2
+	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+						// vmovdqa	0x50(%r11),	%xmm3
+	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
+
+						// vmovdqa	0x60(%r11),	%xmm2
+	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
+	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
+						// vmovdqa	0x70(%r11),	%xmm4
+	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
+	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
+	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
+	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	x2, x2, #16			// add	$-16,	%rdx
+
+Lschedule_mangle_both:
+	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
+	add	x8, x8, #48			// add	$-16,	%r8
+	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
+	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
+	ret
+
+
+.globl	_vpaes_set_encrypt_key
+.private_extern	_vpaes_set_encrypt_key
+
+.align	4
+_vpaes_set_encrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	w3, #0		// mov	$0,%ecx
+	mov	x8, #0x30		// mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	x0, x0, x0
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	_vpaes_set_decrypt_key
+.private_extern	_vpaes_set_decrypt_key
+
+.align	4
+_vpaes_set_decrypt_key:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+
+	lsr	w9, w1, #5		// shr	$5,%eax
+	add	w9, w9, #5		// add	$5,%eax
+	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	w9, w9, #4		// shl	$4,%eax
+	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
+	add	x2, x2, x9
+
+	mov	w3, #1		// mov	$1,%ecx
+	lsr	w8, w1, #1		// shr	$1,%r8d
+	and	x8, x8, #32		// and	$32,%r8d
+	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_vpaes_cbc_encrypt
+.private_extern	_vpaes_cbc_encrypt
+
+.align	4
+_vpaes_cbc_encrypt:
+	AARCH64_SIGN_LINK_REGISTER
+	cbz	x2, Lcbc_abort
+	cmp	w5, #0			// check direction
+	b.eq	vpaes_cbc_decrypt
+
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+
+	ld1	{v0.16b}, [x4]	// load ivec
+	bl	_vpaes_encrypt_preheat
+	b	Lcbc_enc_loop
+
+.align	4
+Lcbc_enc_loop:
+	ld1	{v7.16b}, [x0],#16	// load input
+	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
+	bl	_vpaes_encrypt_core
+	st1	{v0.16b}, [x1],#16	// save output
+	subs	x17, x17, #16
+	b.hi	Lcbc_enc_loop
+
+	st1	{v0.16b}, [x4]	// write ivec
+
+	ldp	x29,x30,[sp],#16
+Lcbc_abort:
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+
+.align	4
+vpaes_cbc_decrypt:
+	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+	// only from vpaes_cbc_encrypt which has already signed the return address.
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	mov	x17, x2		// reassign
+	mov	x2,  x3		// reassign
+	ld1	{v6.16b}, [x4]	// load ivec
+	bl	_vpaes_decrypt_preheat
+	tst	x17, #16
+	b.eq	Lcbc_dec_loop2x
+
+	ld1	{v7.16b}, [x0], #16	// load input
+	bl	_vpaes_decrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	orr	v6.16b, v7.16b, v7.16b	// next ivec value
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #16
+	b.ls	Lcbc_dec_done
+
+.align	4
+Lcbc_dec_loop2x:
+	ld1	{v14.16b,v15.16b}, [x0], #32
+	bl	_vpaes_decrypt_2x
+	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
+	eor	v1.16b, v1.16b, v14.16b
+	orr	v6.16b, v15.16b, v15.16b
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #32
+	b.hi	Lcbc_dec_loop2x
+
+Lcbc_dec_done:
+	st1	{v6.16b}, [x4]
+
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+.globl	_vpaes_ctr32_encrypt_blocks
+.private_extern	_vpaes_ctr32_encrypt_blocks
+
+.align	4
+_vpaes_ctr32_encrypt_blocks:
+	AARCH64_SIGN_LINK_REGISTER
+	stp	x29,x30,[sp,#-16]!
+	add	x29,sp,#0
+	stp	d8,d9,[sp,#-16]!	// ABI spec says so
+	stp	d10,d11,[sp,#-16]!
+	stp	d12,d13,[sp,#-16]!
+	stp	d14,d15,[sp,#-16]!
+
+	cbz	x2, Lctr32_done
+
+	// Note that, unlike the other functions, x2 here is measured in
+	// blocks, not bytes.
+	mov	x17, x2
+	mov	x2,  x3
+
+	// Load the IV and counter portion.
+	ldr	w6, [x4, #12]
+	ld1	{v7.16b}, [x4]
+
+	bl	_vpaes_encrypt_preheat
+	tst	x17, #1
+	rev	w6, w6		// The counter is big-endian.
+	b.eq	Lctr32_prep_loop
+
+	// Handle one block so the remaining block count is even for
+	// _vpaes_encrypt_2x.
+	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
+	bl	_vpaes_encrypt_core
+	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
+	st1	{v0.16b}, [x1], #16
+	subs	x17, x17, #1
+	// Update the counter.
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v7.s[3], w7
+	b.ls	Lctr32_done
+
+Lctr32_prep_loop:
+	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+	// uses v14 and v15.
+	mov	v15.16b, v7.16b
+	mov	v14.16b, v7.16b
+	add	w6, w6, #1
+	rev	w7, w6
+	mov	v15.s[3], w7
+
+Lctr32_loop:
+	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
+	bl	_vpaes_encrypt_2x
+	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
+	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
+	st1	{v0.16b,v1.16b}, [x1], #32
+	subs	x17, x17, #2
+	// Update the counter.
+	add	w7, w6, #1
+	add	w6, w6, #2
+	rev	w7, w7
+	mov	v14.s[3], w7
+	rev	w7, w6
+	mov	v15.s[3], w7
+	b.hi	Lctr32_loop
+
+Lctr32_done:
+	ldp	d14,d15,[sp],#16
+	ldp	d12,d13,[sp],#16
+	ldp	d10,d11,[sp],#16
+	ldp	d8,d9,[sp],#16
+	ldp	x29,x30,[sp],#16
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-aarch64/crypto/test/trampoline-armv8.S b/apple-aarch64/crypto/test/trampoline-armv8.S
new file mode 100644
index 0000000..325da9b
--- /dev/null
+++ b/apple-aarch64/crypto/test/trampoline-armv8.S
@@ -0,0 +1,758 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+//                              const uint64_t *argv, size_t argc,
+//                              uint64_t unwind);
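+//
+// As the loads below show, |state| packs the callee-saved registers
+// d8-d15 then x19-x28, 18 8-byte slots. A matching C layout might be
+// (illustrative, not the actual header):
+//     struct CallerState { uint64_t d8_d15[8], x19_x28[10]; };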
+
+.globl	_abi_test_trampoline
+.private_extern	_abi_test_trampoline
+.align	4
+_abi_test_trampoline:
+Labi_test_trampoline_begin:
+	AARCH64_SIGN_LINK_REGISTER
+	// Stack layout (low to high addresses)
+	//   x29,x30 (16 bytes)
+	//    d8-d15 (64 bytes)
+	//   x19-x28 (80 bytes)
+	//    x1 (8 bytes)
+	//   padding (8 bytes)
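+	//   (16 + 64 + 80 + 8 + 8 = 176 bytes, matching the stp below)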
+	stp	x29, x30, [sp, #-176]!
+	mov	x29, sp
+
+	// Saved callee-saved registers and |state|.
+	stp	d8, d9, [sp, #16]
+	stp	d10, d11, [sp, #32]
+	stp	d12, d13, [sp, #48]
+	stp	d14, d15, [sp, #64]
+	stp	x19, x20, [sp, #80]
+	stp	x21, x22, [sp, #96]
+	stp	x23, x24, [sp, #112]
+	stp	x25, x26, [sp, #128]
+	stp	x27, x28, [sp, #144]
+	str	x1, [sp, #160]
+
+	// Load registers from |state|, with the exception of x29. x29 is the
+	// frame pointer and also callee-saved, but AAPCS64 allows platforms to
+	// mandate that x29 always point to a frame. iOS64 does so, which means
+	// we cannot fill x29 with entropy without violating ABI rules
+	// ourselves. x29 is tested separately below.
+	ldp	d8, d9, [x1], #16
+	ldp	d10, d11, [x1], #16
+	ldp	d12, d13, [x1], #16
+	ldp	d14, d15, [x1], #16
+	ldp	x19, x20, [x1], #16
+	ldp	x21, x22, [x1], #16
+	ldp	x23, x24, [x1], #16
+	ldp	x25, x26, [x1], #16
+	ldp	x27, x28, [x1], #16
+
+	// Move parameters into temporary registers.
+	mov	x9, x0
+	mov	x10, x2
+	mov	x11, x3
+
+	// Load parameters into registers.
+	cbz	x11, Largs_done
+	ldr	x0, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x1, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x2, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x3, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x4, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x5, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x6, [x10], #8
+	subs	x11, x11, #1
+	b.eq	Largs_done
+	ldr	x7, [x10], #8
+
+Largs_done:
+	blr	x9
+
+	// Reload |state| and store registers.
+	ldr	x1, [sp, #160]
+	stp	d8, d9, [x1], #16
+	stp	d10, d11, [x1], #16
+	stp	d12, d13, [x1], #16
+	stp	d14, d15, [x1], #16
+	stp	x19, x20, [x1], #16
+	stp	x21, x22, [x1], #16
+	stp	x23, x24, [x1], #16
+	stp	x25, x26, [x1], #16
+	stp	x27, x28, [x1], #16
+
+	// |func| is required to preserve x29, the frame pointer. We cannot load
+	// random values into x29 (see comment above), so compare it against the
+	// expected value and zero the field of |state| if corrupted.
+	mov	x9, sp
+	cmp	x29, x9
+	b.eq	Lx29_ok
+	str	xzr, [x1]
+
+Lx29_ok:
+	// Restore callee-saved registers.
+	ldp	d8, d9, [sp, #16]
+	ldp	d10, d11, [sp, #32]
+	ldp	d12, d13, [sp, #48]
+	ldp	d14, d15, [sp, #64]
+	ldp	x19, x20, [sp, #80]
+	ldp	x21, x22, [sp, #96]
+	ldp	x23, x24, [sp, #112]
+	ldp	x25, x26, [sp, #128]
+	ldp	x27, x28, [sp, #144]
+
+	ldp	x29, x30, [sp], #176
+	AARCH64_VALIDATE_LINK_REGISTER
+	ret
+
+
+.globl	_abi_test_clobber_x0
+.private_extern	_abi_test_clobber_x0
+.align	4
+_abi_test_clobber_x0:
+	AARCH64_VALID_CALL_TARGET
+	mov	x0, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x1
+.private_extern	_abi_test_clobber_x1
+.align	4
+_abi_test_clobber_x1:
+	AARCH64_VALID_CALL_TARGET
+	mov	x1, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x2
+.private_extern	_abi_test_clobber_x2
+.align	4
+_abi_test_clobber_x2:
+	AARCH64_VALID_CALL_TARGET
+	mov	x2, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x3
+.private_extern	_abi_test_clobber_x3
+.align	4
+_abi_test_clobber_x3:
+	AARCH64_VALID_CALL_TARGET
+	mov	x3, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x4
+.private_extern	_abi_test_clobber_x4
+.align	4
+_abi_test_clobber_x4:
+	AARCH64_VALID_CALL_TARGET
+	mov	x4, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x5
+.private_extern	_abi_test_clobber_x5
+.align	4
+_abi_test_clobber_x5:
+	AARCH64_VALID_CALL_TARGET
+	mov	x5, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x6
+.private_extern	_abi_test_clobber_x6
+.align	4
+_abi_test_clobber_x6:
+	AARCH64_VALID_CALL_TARGET
+	mov	x6, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x7
+.private_extern	_abi_test_clobber_x7
+.align	4
+_abi_test_clobber_x7:
+	AARCH64_VALID_CALL_TARGET
+	mov	x7, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x8
+.private_extern	_abi_test_clobber_x8
+.align	4
+_abi_test_clobber_x8:
+	AARCH64_VALID_CALL_TARGET
+	mov	x8, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x9
+.private_extern	_abi_test_clobber_x9
+.align	4
+_abi_test_clobber_x9:
+	AARCH64_VALID_CALL_TARGET
+	mov	x9, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x10
+.private_extern	_abi_test_clobber_x10
+.align	4
+_abi_test_clobber_x10:
+	AARCH64_VALID_CALL_TARGET
+	mov	x10, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x11
+.private_extern	_abi_test_clobber_x11
+.align	4
+_abi_test_clobber_x11:
+	AARCH64_VALID_CALL_TARGET
+	mov	x11, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x12
+.private_extern	_abi_test_clobber_x12
+.align	4
+_abi_test_clobber_x12:
+	AARCH64_VALID_CALL_TARGET
+	mov	x12, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x13
+.private_extern	_abi_test_clobber_x13
+.align	4
+_abi_test_clobber_x13:
+	AARCH64_VALID_CALL_TARGET
+	mov	x13, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x14
+.private_extern	_abi_test_clobber_x14
+.align	4
+_abi_test_clobber_x14:
+	AARCH64_VALID_CALL_TARGET
+	mov	x14, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x15
+.private_extern	_abi_test_clobber_x15
+.align	4
+_abi_test_clobber_x15:
+	AARCH64_VALID_CALL_TARGET
+	mov	x15, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x16
+.private_extern	_abi_test_clobber_x16
+.align	4
+_abi_test_clobber_x16:
+	AARCH64_VALID_CALL_TARGET
+	mov	x16, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x17
+.private_extern	_abi_test_clobber_x17
+.align	4
+_abi_test_clobber_x17:
+	AARCH64_VALID_CALL_TARGET
+	mov	x17, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x19
+.private_extern	_abi_test_clobber_x19
+.align	4
+_abi_test_clobber_x19:
+	AARCH64_VALID_CALL_TARGET
+	mov	x19, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x20
+.private_extern	_abi_test_clobber_x20
+.align	4
+_abi_test_clobber_x20:
+	AARCH64_VALID_CALL_TARGET
+	mov	x20, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x21
+.private_extern	_abi_test_clobber_x21
+.align	4
+_abi_test_clobber_x21:
+	AARCH64_VALID_CALL_TARGET
+	mov	x21, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x22
+.private_extern	_abi_test_clobber_x22
+.align	4
+_abi_test_clobber_x22:
+	AARCH64_VALID_CALL_TARGET
+	mov	x22, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x23
+.private_extern	_abi_test_clobber_x23
+.align	4
+_abi_test_clobber_x23:
+	AARCH64_VALID_CALL_TARGET
+	mov	x23, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x24
+.private_extern	_abi_test_clobber_x24
+.align	4
+_abi_test_clobber_x24:
+	AARCH64_VALID_CALL_TARGET
+	mov	x24, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x25
+.private_extern	_abi_test_clobber_x25
+.align	4
+_abi_test_clobber_x25:
+	AARCH64_VALID_CALL_TARGET
+	mov	x25, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x26
+.private_extern	_abi_test_clobber_x26
+.align	4
+_abi_test_clobber_x26:
+	AARCH64_VALID_CALL_TARGET
+	mov	x26, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x27
+.private_extern	_abi_test_clobber_x27
+.align	4
+_abi_test_clobber_x27:
+	AARCH64_VALID_CALL_TARGET
+	mov	x27, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x28
+.private_extern	_abi_test_clobber_x28
+.align	4
+_abi_test_clobber_x28:
+	AARCH64_VALID_CALL_TARGET
+	mov	x28, xzr
+	ret
+
+
+.globl	_abi_test_clobber_x29
+.private_extern	_abi_test_clobber_x29
+.align	4
+_abi_test_clobber_x29:
+	AARCH64_VALID_CALL_TARGET
+	mov	x29, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d0
+.private_extern	_abi_test_clobber_d0
+.align	4
+_abi_test_clobber_d0:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d0, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d1
+.private_extern	_abi_test_clobber_d1
+.align	4
+_abi_test_clobber_d1:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d1, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d2
+.private_extern	_abi_test_clobber_d2
+.align	4
+_abi_test_clobber_d2:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d2, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d3
+.private_extern	_abi_test_clobber_d3
+.align	4
+_abi_test_clobber_d3:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d3, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d4
+.private_extern	_abi_test_clobber_d4
+.align	4
+_abi_test_clobber_d4:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d4, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d5
+.private_extern	_abi_test_clobber_d5
+.align	4
+_abi_test_clobber_d5:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d5, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d6
+.private_extern	_abi_test_clobber_d6
+.align	4
+_abi_test_clobber_d6:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d6, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d7
+.private_extern	_abi_test_clobber_d7
+.align	4
+_abi_test_clobber_d7:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d7, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d8
+.private_extern	_abi_test_clobber_d8
+.align	4
+_abi_test_clobber_d8:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d8, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d9
+.private_extern	_abi_test_clobber_d9
+.align	4
+_abi_test_clobber_d9:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d9, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d10
+.private_extern	_abi_test_clobber_d10
+.align	4
+_abi_test_clobber_d10:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d10, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d11
+.private_extern	_abi_test_clobber_d11
+.align	4
+_abi_test_clobber_d11:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d11, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d12
+.private_extern	_abi_test_clobber_d12
+.align	4
+_abi_test_clobber_d12:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d12, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d13
+.private_extern	_abi_test_clobber_d13
+.align	4
+_abi_test_clobber_d13:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d13, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d14
+.private_extern	_abi_test_clobber_d14
+.align	4
+_abi_test_clobber_d14:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d14, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d15
+.private_extern	_abi_test_clobber_d15
+.align	4
+_abi_test_clobber_d15:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d15, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d16
+.private_extern	_abi_test_clobber_d16
+.align	4
+_abi_test_clobber_d16:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d16, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d17
+.private_extern	_abi_test_clobber_d17
+.align	4
+_abi_test_clobber_d17:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d17, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d18
+.private_extern	_abi_test_clobber_d18
+.align	4
+_abi_test_clobber_d18:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d18, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d19
+.private_extern	_abi_test_clobber_d19
+.align	4
+_abi_test_clobber_d19:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d19, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d20
+.private_extern	_abi_test_clobber_d20
+.align	4
+_abi_test_clobber_d20:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d20, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d21
+.private_extern	_abi_test_clobber_d21
+.align	4
+_abi_test_clobber_d21:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d21, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d22
+.private_extern	_abi_test_clobber_d22
+.align	4
+_abi_test_clobber_d22:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d22, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d23
+.private_extern	_abi_test_clobber_d23
+.align	4
+_abi_test_clobber_d23:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d23, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d24
+.private_extern	_abi_test_clobber_d24
+.align	4
+_abi_test_clobber_d24:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d24, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d25
+.private_extern	_abi_test_clobber_d25
+.align	4
+_abi_test_clobber_d25:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d25, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d26
+.private_extern	_abi_test_clobber_d26
+.align	4
+_abi_test_clobber_d26:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d26, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d27
+.private_extern	_abi_test_clobber_d27
+.align	4
+_abi_test_clobber_d27:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d27, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d28
+.private_extern	_abi_test_clobber_d28
+.align	4
+_abi_test_clobber_d28:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d28, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d29
+.private_extern	_abi_test_clobber_d29
+.align	4
+_abi_test_clobber_d29:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d29, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d30
+.private_extern	_abi_test_clobber_d30
+.align	4
+_abi_test_clobber_d30:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d30, xzr
+	ret
+
+
+.globl	_abi_test_clobber_d31
+.private_extern	_abi_test_clobber_d31
+.align	4
+_abi_test_clobber_d31:
+	AARCH64_VALID_CALL_TARGET
+	fmov	d31, xzr
+	ret
+
+
+.globl	_abi_test_clobber_v8_upper
+.private_extern	_abi_test_clobber_v8_upper
+.align	4
+_abi_test_clobber_v8_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v8.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v9_upper
+.private_extern	_abi_test_clobber_v9_upper
+.align	4
+_abi_test_clobber_v9_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v9.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v10_upper
+.private_extern	_abi_test_clobber_v10_upper
+.align	4
+_abi_test_clobber_v10_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v10.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v11_upper
+.private_extern	_abi_test_clobber_v11_upper
+.align	4
+_abi_test_clobber_v11_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v11.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v12_upper
+.private_extern	_abi_test_clobber_v12_upper
+.align	4
+_abi_test_clobber_v12_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v12.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v13_upper
+.private_extern	_abi_test_clobber_v13_upper
+.align	4
+_abi_test_clobber_v13_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v13.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v14_upper
+.private_extern	_abi_test_clobber_v14_upper
+.align	4
+_abi_test_clobber_v14_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v14.d[1], xzr
+	ret
+
+
+.globl	_abi_test_clobber_v15_upper
+.private_extern	_abi_test_clobber_v15_upper
+.align	4
+_abi_test_clobber_v15_upper:
+	AARCH64_VALID_CALL_TARGET
+	fmov	v15.d[1], xzr
+	ret
+
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/chacha/chacha-armv4.S b/apple-arm/crypto/chacha/chacha-armv4.S
new file mode 100644
index 0000000..cadf2b6
--- /dev/null
+++ b/apple-arm/crypto/chacha/chacha-armv4.S
@@ -0,0 +1,1498 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb	ldrbhs
+#endif
+
+.align	5
+Lsigma:
+.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
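+@ (The four sigma words spell "expand 32-byte k", the standard ChaCha constant.)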
+Lone:
+.long	1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-LChaCha20_ctr32
+#else
+.word	-1
+#endif
+
+.globl	_ChaCha20_ctr32
+.private_extern	_ChaCha20_ctr32
+#ifdef __thumb2__
+.thumb_func	_ChaCha20_ctr32
+#endif
+.align	5
+_ChaCha20_ctr32:
+LChaCha20_ctr32:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r14,pc,#16		@ _ChaCha20_ctr32
+#else
+	adr	r14,LChaCha20_ctr32
+#endif
+	cmp	r2,#0			@ len==0?
+#ifdef	__thumb2__
+	itt	eq
+#endif
+	addeq	sp,sp,#4*3
+	beq	Lno_data
+#if __ARM_MAX_ARCH__>=7
+	cmp	r2,#192			@ test len
+	bls	Lshort
+	ldr	r4,[r14,#-32]
+	ldr	r4,[r14,r4]
+# ifdef	__APPLE__
+	ldr	r4,[r4]
+# endif
+	tst	r4,#ARMV7_NEON
+	bne	LChaCha20_neon
+Lshort:
+#endif
+	ldmia	r12,{r4,r5,r6,r7}		@ load counter and nonce
+	sub	sp,sp,#4*(16)		@ off-load area
+	sub	r14,r14,#64		@ Lsigma
+	stmdb	sp!,{r4,r5,r6,r7}		@ copy counter and nonce
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy key
+	stmdb	sp!,{r0,r1,r2,r3}		@ copy sigma
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	b	Loop_outer_enter
+
+.align	4
+Loop_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	str	r11,[sp,#4*(32+2)]	@ save len
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	str	r14,  [sp,#4*(32+0)]	@ save out
+Loop_outer_enter:
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	b	Loop
+
+.align	4
+Loop:
+	subs	r11,r11,#1
+	add	r0,r0,r4
+	mov	r12,r12,ror#16
+	add	r1,r1,r5
+	mov	r10,r10,ror#16
+	eor	r12,r12,r0,ror#16
+	eor	r10,r10,r1,ror#16
+	add	r8,r8,r12
+	mov	r4,r4,ror#20
+	add	r9,r9,r10
+	mov	r5,r5,ror#20
+	eor	r4,r4,r8,ror#20
+	eor	r5,r5,r9,ror#20
+	add	r0,r0,r4
+	mov	r12,r12,ror#24
+	add	r1,r1,r5
+	mov	r10,r10,ror#24
+	eor	r12,r12,r0,ror#24
+	eor	r10,r10,r1,ror#24
+	add	r8,r8,r12
+	mov	r4,r4,ror#25
+	add	r9,r9,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+13)]
+	ldr	r10,[sp,#4*(16+15)]
+	eor	r4,r4,r8,ror#25
+	eor	r5,r5,r9,ror#25
+	str	r8,[sp,#4*(16+8)]
+	ldr	r8,[sp,#4*(16+10)]
+	add	r2,r2,r6
+	mov	r14,r14,ror#16
+	str	r9,[sp,#4*(16+9)]
+	ldr	r9,[sp,#4*(16+11)]
+	add	r3,r3,r7
+	mov	r10,r10,ror#16
+	eor	r14,r14,r2,ror#16
+	eor	r10,r10,r3,ror#16
+	add	r8,r8,r14
+	mov	r6,r6,ror#20
+	add	r9,r9,r10
+	mov	r7,r7,ror#20
+	eor	r6,r6,r8,ror#20
+	eor	r7,r7,r9,ror#20
+	add	r2,r2,r6
+	mov	r14,r14,ror#24
+	add	r3,r3,r7
+	mov	r10,r10,ror#24
+	eor	r14,r14,r2,ror#24
+	eor	r10,r10,r3,ror#24
+	add	r8,r8,r14
+	mov	r6,r6,ror#25
+	add	r9,r9,r10
+	mov	r7,r7,ror#25
+	eor	r6,r6,r8,ror#25
+	eor	r7,r7,r9,ror#25
+	add	r0,r0,r5
+	mov	r10,r10,ror#16
+	add	r1,r1,r6
+	mov	r12,r12,ror#16
+	eor	r10,r10,r0,ror#16
+	eor	r12,r12,r1,ror#16
+	add	r8,r8,r10
+	mov	r5,r5,ror#20
+	add	r9,r9,r12
+	mov	r6,r6,ror#20
+	eor	r5,r5,r8,ror#20
+	eor	r6,r6,r9,ror#20
+	add	r0,r0,r5
+	mov	r10,r10,ror#24
+	add	r1,r1,r6
+	mov	r12,r12,ror#24
+	eor	r10,r10,r0,ror#24
+	eor	r12,r12,r1,ror#24
+	add	r8,r8,r10
+	mov	r5,r5,ror#25
+	str	r10,[sp,#4*(16+15)]
+	ldr	r10,[sp,#4*(16+13)]
+	add	r9,r9,r12
+	mov	r6,r6,ror#25
+	eor	r5,r5,r8,ror#25
+	eor	r6,r6,r9,ror#25
+	str	r8,[sp,#4*(16+10)]
+	ldr	r8,[sp,#4*(16+8)]
+	add	r2,r2,r7
+	mov	r10,r10,ror#16
+	str	r9,[sp,#4*(16+11)]
+	ldr	r9,[sp,#4*(16+9)]
+	add	r3,r3,r4
+	mov	r14,r14,ror#16
+	eor	r10,r10,r2,ror#16
+	eor	r14,r14,r3,ror#16
+	add	r8,r8,r10
+	mov	r7,r7,ror#20
+	add	r9,r9,r14
+	mov	r4,r4,ror#20
+	eor	r7,r7,r8,ror#20
+	eor	r4,r4,r9,ror#20
+	add	r2,r2,r7
+	mov	r10,r10,ror#24
+	add	r3,r3,r4
+	mov	r14,r14,ror#24
+	eor	r10,r10,r2,ror#24
+	eor	r14,r14,r3,ror#24
+	add	r8,r8,r10
+	mov	r7,r7,ror#25
+	add	r9,r9,r14
+	mov	r4,r4,ror#25
+	eor	r7,r7,r8,ror#25
+	eor	r4,r4,r9,ror#25
+	bne	Loop
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
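+	@ (a ChaCha20 block is 16 32-bit words = 512 bits; the r11=10
+	@ loop counter above runs ten double-rounds, i.e. 20 rounds)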
+
+	cmp	r11,#64		@ done yet?
+#ifdef	__thumb2__
+	itete	lo
+#endif
+	addlo	r12,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
+	addlo	r14,sp,#4*(0)		@ shortcut or ...
+	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out
+
+	ldr	r8,[sp,#4*(0)]	@ load key material
+	ldr	r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+	orr	r10,r12,r14
+	tst	r10,#3		@ are input and output aligned?
+	ldr	r10,[sp,#4*(2)]
+	bne	Lunaligned
+	cmp	r11,#64		@ restore flags
+# else
+	ldr	r10,[sp,#4*(2)]
+# endif
+	ldr	r11,[sp,#4*(3)]
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8	@ xor with input
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(4)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r1,[r14,#-12]
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+	add	r8,sp,#4*(8)
+	str	r4,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r5,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r1,r1,r9
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r0,r0,r8
+	eorhs	r1,r1,r9
+	add	r8,sp,#4*(12)
+	str	r0,[r14],#16		@ store output
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r2,r2,r10
+	eorhs	r3,r3,r11
+	str	r1,[r14,#-12]
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r5,r5,r9
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1		@ next counter value
+	strhi	r8,[sp,#4*(12)]	@ save next counter value
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r8,[r12],#16		@ load input
+	ldrhs	r9,[r12,#-12]
+	add	r6,r6,r10
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhs	r10,[r12,#-8]
+	ldrhs	r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r4,r4,r8
+	eorhs	r5,r5,r9
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	eorhs	r6,r6,r10
+	eorhs	r7,r7,r11
+	str	r4,[r14],#16		@ store output
+	str	r5,[r14,#-12]
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64		@ len-=64
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	Loop_outer
+
+	beq	Ldone
+# if __ARM_ARCH__<7
+	b	Ltail
+
+.align	4
+Lunaligned:	@ unaligned endian-neutral path
+	cmp	r11,#64		@ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+	ldr	r11,[sp,#4*(3)]
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+0)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r0,sp,#4*(16+8)
+	add	r4,r4,r8		@ accumulate key material
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+	add	r8,sp,#4*(4+4)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}		@ load second half
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]		@ copy "rx"
+	strhi	r11,[sp,#4*(16+11)]		@ copy "rx"
+	add	r0,r0,r8		@ accumulate key material
+	add	r1,r1,r9
+	add	r2,r2,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r3,r3,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r0,r8,r0		@ xor with input (or zero)
+	eor	r1,r9,r1
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r2,r10,r2
+	strb	r0,[r14],#16		@ store output
+	eor	r3,r11,r3
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r1,[r14,#-12]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-8]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r3,[r14,#-4]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-15]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r1,[r14,#-11]
+	eor	r0,r8,r0,lsr#8
+	strb	r2,[r14,#-7]
+	eor	r1,r9,r1,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r3,[r14,#-3]
+	eor	r2,r10,r2,lsr#8
+	strb	r0,[r14,#-14]
+	eor	r3,r11,r3,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r1,[r14,#-10]
+	strb	r2,[r14,#-6]
+	eor	r0,r8,r0,lsr#8
+	strb	r3,[r14,#-2]
+	eor	r1,r9,r1,lsr#8
+	strb	r0,[r14,#-13]
+	eor	r2,r10,r2,lsr#8
+	strb	r1,[r14,#-9]
+	eor	r3,r11,r3,lsr#8
+	strb	r2,[r14,#-5]
+	strb	r3,[r14,#-1]
+	add	r8,sp,#4*(4+8)
+	ldmia	r8,{r8,r9,r10,r11}		@ load key material
+	add	r4,r4,r8		@ accumulate key material
+# ifdef	__thumb2__
+	itt	hi
+# endif
+	addhi	r8,r8,#1			@ next counter value
+	strhi	r8,[sp,#4*(12)]		@ save next counter value
+	add	r5,r5,r9
+	add	r6,r6,r10
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r8,r8,r8		@ zero or ...
+	ldrhsb	r8,[r12],#16			@ ... load input
+	eorlo	r9,r9,r9
+	ldrhsb	r9,[r12,#-12]
+
+	add	r7,r7,r11
+# ifdef	__thumb2__
+	itete	lo
+# endif
+	eorlo	r10,r10,r10
+	ldrhsb	r10,[r12,#-8]
+	eorlo	r11,r11,r11
+	ldrhsb	r11,[r12,#-4]
+
+	eor	r4,r8,r4		@ xor with input (or zero)
+	eor	r5,r9,r5
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-15]		@ load more input
+	ldrhsb	r9,[r12,#-11]
+	eor	r6,r10,r6
+	strb	r4,[r14],#16		@ store output
+	eor	r7,r11,r7
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-7]
+	ldrhsb	r11,[r12,#-3]
+	strb	r5,[r14,#-12]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-8]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-14]		@ load more input
+	ldrhsb	r9,[r12,#-10]
+	strb	r7,[r14,#-4]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-15]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-6]
+	ldrhsb	r11,[r12,#-2]
+	strb	r5,[r14,#-11]
+	eor	r4,r8,r4,lsr#8
+	strb	r6,[r14,#-7]
+	eor	r5,r9,r5,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r8,[r12,#-13]		@ load more input
+	ldrhsb	r9,[r12,#-9]
+	strb	r7,[r14,#-3]
+	eor	r6,r10,r6,lsr#8
+	strb	r4,[r14,#-14]
+	eor	r7,r11,r7,lsr#8
+# ifdef	__thumb2__
+	itt	hs
+# endif
+	ldrhsb	r10,[r12,#-5]
+	ldrhsb	r11,[r12,#-1]
+	strb	r5,[r14,#-10]
+	strb	r6,[r14,#-6]
+	eor	r4,r8,r4,lsr#8
+	strb	r7,[r14,#-2]
+	eor	r5,r9,r5,lsr#8
+	strb	r4,[r14,#-13]
+	eor	r6,r10,r6,lsr#8
+	strb	r5,[r14,#-9]
+	eor	r7,r11,r7,lsr#8
+	strb	r6,[r14,#-5]
+	strb	r7,[r14,#-1]
+# ifdef	__thumb2__
+	it	ne
+# endif
+	ldrne	r8,[sp,#4*(32+2)]		@ re-load len
+# ifdef	__thumb2__
+	it	hs
+# endif
+	subhs	r11,r8,#64			@ len-=64
+	bhi	Loop_outer
+
+	beq	Ldone
+#endif
+
+Ltail:
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	add	r9,sp,#4*(0)
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+Loop_tail:
+	ldrb	r10,[r9],#1	@ read buffer on stack
+	ldrb	r11,[r12],#1		@ read input
+	subs	r8,r8,#1
+	eor	r11,r11,r10
+	strb	r11,[r14],#1		@ store output
+	bne	Loop_tail
+
+Ldone:
+	add	sp,sp,#4*(32+3)
+Lno_data:
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func	ChaCha20_neon
+#endif
+.align	5
+ChaCha20_neon:
+	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+LChaCha20_neon:
+	adr	r14,Lsigma
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI spec says so
+	stmdb	sp!,{r0,r1,r2,r3}
+
+	vld1.32	{q1,q2},[r3]		@ load key
+	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}		@ load key
+
+	sub	sp,sp,#4*(16+16)
+	vld1.32	{q3},[r12]		@ load counter and nonce
+	add	r12,sp,#4*8
+	ldmia	r14,{r0,r1,r2,r3}		@ load sigma
+	vld1.32	{q0},[r14]!		@ load sigma
+	vld1.32	{q12},[r14]		@ one
+	vst1.32	{q2,q3},[r12]		@ copy 1/2key|counter|nonce
+	vst1.32	{q0,q1},[sp]		@ copy sigma|1/2key
+
+	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
+	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
+	vshl.i32	d26,d24,#1	@ two
+	vstr	d24,[sp,#4*(16+0)]
+	vshl.i32	d28,d24,#2	@ four
+	vstr	d26,[sp,#4*(16+2)]
+	vmov	q4,q0
+	vstr	d28,[sp,#4*(16+4)]
+	vmov	q8,q0
+	vmov	q5,q1
+	vmov	q9,q1
+	b	Loop_neon_enter
+
+.align	4
+Loop_neon_outer:
+	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}		@ load key material
+	cmp	r11,#64*2		@ if len<=64*2
+	bls	Lbreak_neon		@ switch to integer-only
+	vmov	q4,q0
+	str	r11,[sp,#4*(32+2)]	@ save len
+	vmov	q8,q0
+	str	r12,  [sp,#4*(32+1)]	@ save inp
+	vmov	q5,q1
+	str	r14,  [sp,#4*(32+0)]	@ save out
+	vmov	q9,q1
+Loop_neon_enter:
+	ldr	r11, [sp,#4*(15)]
+	vadd.i32	q7,q3,q12		@ counter+1
+	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
+	vmov	q6,q2
+	ldr	r10, [sp,#4*(13)]
+	vmov	q10,q2
+	ldr	r14,[sp,#4*(14)]
+	vadd.i32	q11,q7,q12		@ counter+2
+	str	r11, [sp,#4*(16+15)]
+	mov	r11,#10
+	add	r12,r12,#3	@ counter+3
+	b	Loop_neon
+
+.align	4
+Loop_neon:
+	subs	r11,r11,#1
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r4
+	vadd.i32	q4,q4,q5
+	mov	r12,r12,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r5
+	veor	q3,q3,q0
+	mov	r10,r10,ror#16
+	veor	q7,q7,q4
+	eor	r12,r12,r0,ror#16
+	veor	q11,q11,q8
+	eor	r10,r10,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r12
+	vrev32.16	q7,q7
+	mov	r4,r4,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r10
+	vadd.i32	q2,q2,q3
+	mov	r5,r5,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r4,r4,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r5,r5,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r4
+	veor	q13,q5,q6
+	mov	r12,r12,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r5
+	vshr.u32	q1,q12,#20
+	mov	r10,r10,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r12,r12,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r10,r10,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r12
+	vsli.32	q5,q13,#12
+	mov	r4,r4,ror#25
+	vsli.32	q9,q14,#12
+	add	r9,r9,r10
+	vadd.i32	q0,q0,q1
+	mov	r5,r5,ror#25
+	vadd.i32	q4,q4,q5
+	str	r10,[sp,#4*(16+13)]
+	vadd.i32	q8,q8,q9
+	ldr	r10,[sp,#4*(16+15)]
+	veor	q12,q3,q0
+	eor	r4,r4,r8,ror#25
+	veor	q13,q7,q4
+	eor	r5,r5,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+8)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+10)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r6
+	vshr.u32	q11,q14,#24
+	mov	r14,r14,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+9)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+11)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r7
+	vadd.i32	q2,q2,q3
+	mov	r10,r10,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r14,r14,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r10,r10,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r14
+	veor	q13,q5,q6
+	mov	r6,r6,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r10
+	vshr.u32	q1,q12,#25
+	mov	r7,r7,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r6,r6,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r7,r7,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r6
+	vsli.32	q5,q13,#7
+	mov	r14,r14,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r7
+	vext.8	q2,q2,q2,#8
+	mov	r10,r10,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r14,r14,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r10,r10,r3,ror#24
+	vext.8	q1,q1,q1,#4
+	add	r8,r8,r14
+	vext.8	q5,q5,q5,#4
+	mov	r6,r6,ror#25
+	vext.8	q9,q9,q9,#4
+	add	r9,r9,r10
+	vext.8	q3,q3,q3,#12
+	mov	r7,r7,ror#25
+	vext.8	q7,q7,q7,#12
+	eor	r6,r6,r8,ror#25
+	vext.8	q11,q11,q11,#12
+	eor	r7,r7,r9,ror#25
+	vadd.i32	q0,q0,q1
+	add	r0,r0,r5
+	vadd.i32	q4,q4,q5
+	mov	r10,r10,ror#16
+	vadd.i32	q8,q8,q9
+	add	r1,r1,r6
+	veor	q3,q3,q0
+	mov	r12,r12,ror#16
+	veor	q7,q7,q4
+	eor	r10,r10,r0,ror#16
+	veor	q11,q11,q8
+	eor	r12,r12,r1,ror#16
+	vrev32.16	q3,q3
+	add	r8,r8,r10
+	vrev32.16	q7,q7
+	mov	r5,r5,ror#20
+	vrev32.16	q11,q11
+	add	r9,r9,r12
+	vadd.i32	q2,q2,q3
+	mov	r6,r6,ror#20
+	vadd.i32	q6,q6,q7
+	eor	r5,r5,r8,ror#20
+	vadd.i32	q10,q10,q11
+	eor	r6,r6,r9,ror#20
+	veor	q12,q1,q2
+	add	r0,r0,r5
+	veor	q13,q5,q6
+	mov	r10,r10,ror#24
+	veor	q14,q9,q10
+	add	r1,r1,r6
+	vshr.u32	q1,q12,#20
+	mov	r12,r12,ror#24
+	vshr.u32	q5,q13,#20
+	eor	r10,r10,r0,ror#24
+	vshr.u32	q9,q14,#20
+	eor	r12,r12,r1,ror#24
+	vsli.32	q1,q12,#12
+	add	r8,r8,r10
+	vsli.32	q5,q13,#12
+	mov	r5,r5,ror#25
+	vsli.32	q9,q14,#12
+	str	r10,[sp,#4*(16+15)]
+	vadd.i32	q0,q0,q1
+	ldr	r10,[sp,#4*(16+13)]
+	vadd.i32	q4,q4,q5
+	add	r9,r9,r12
+	vadd.i32	q8,q8,q9
+	mov	r6,r6,ror#25
+	veor	q12,q3,q0
+	eor	r5,r5,r8,ror#25
+	veor	q13,q7,q4
+	eor	r6,r6,r9,ror#25
+	veor	q14,q11,q8
+	str	r8,[sp,#4*(16+10)]
+	vshr.u32	q3,q12,#24
+	ldr	r8,[sp,#4*(16+8)]
+	vshr.u32	q7,q13,#24
+	add	r2,r2,r7
+	vshr.u32	q11,q14,#24
+	mov	r10,r10,ror#16
+	vsli.32	q3,q12,#8
+	str	r9,[sp,#4*(16+11)]
+	vsli.32	q7,q13,#8
+	ldr	r9,[sp,#4*(16+9)]
+	vsli.32	q11,q14,#8
+	add	r3,r3,r4
+	vadd.i32	q2,q2,q3
+	mov	r14,r14,ror#16
+	vadd.i32	q6,q6,q7
+	eor	r10,r10,r2,ror#16
+	vadd.i32	q10,q10,q11
+	eor	r14,r14,r3,ror#16
+	veor	q12,q1,q2
+	add	r8,r8,r10
+	veor	q13,q5,q6
+	mov	r7,r7,ror#20
+	veor	q14,q9,q10
+	add	r9,r9,r14
+	vshr.u32	q1,q12,#25
+	mov	r4,r4,ror#20
+	vshr.u32	q5,q13,#25
+	eor	r7,r7,r8,ror#20
+	vshr.u32	q9,q14,#25
+	eor	r4,r4,r9,ror#20
+	vsli.32	q1,q12,#7
+	add	r2,r2,r7
+	vsli.32	q5,q13,#7
+	mov	r10,r10,ror#24
+	vsli.32	q9,q14,#7
+	add	r3,r3,r4
+	vext.8	q2,q2,q2,#8
+	mov	r14,r14,ror#24
+	vext.8	q6,q6,q6,#8
+	eor	r10,r10,r2,ror#24
+	vext.8	q10,q10,q10,#8
+	eor	r14,r14,r3,ror#24
+	vext.8	q1,q1,q1,#12
+	add	r8,r8,r10
+	vext.8	q5,q5,q5,#12
+	mov	r7,r7,ror#25
+	vext.8	q9,q9,q9,#12
+	add	r9,r9,r14
+	vext.8	q3,q3,q3,#4
+	mov	r4,r4,ror#25
+	vext.8	q7,q7,q7,#4
+	eor	r7,r7,r8,ror#25
+	vext.8	q11,q11,q11,#4
+	eor	r4,r4,r9,ror#25
+	bne	Loop_neon
+
+	add	r11,sp,#32
+	vld1.32	{q12,q13},[sp]		@ load key material
+	vld1.32	{q14,q15},[r11]
+
+	ldr	r11,[sp,#4*(32+2)]	@ load len
+
+	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
+	str	r9, [sp,#4*(16+9)]
+	str	r12,[sp,#4*(16+12)]
+	str	r10, [sp,#4*(16+13)]
+	str	r14,[sp,#4*(16+14)]
+
+	@ at this point we have first half of 512-bit result in
+	@ rx and second half at sp+4*(16+8)
+
+	ldr	r12,[sp,#4*(32+1)]	@ load inp
+	ldr	r14,[sp,#4*(32+0)]	@ load out
+
+	vadd.i32	q0,q0,q12		@ accumulate key material
+	vadd.i32	q4,q4,q12
+	vadd.i32	q8,q8,q12
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	vadd.i32	q1,q1,q13
+	vadd.i32	q5,q5,q13
+	vadd.i32	q9,q9,q13
+	vldr	d26,[sp,#4*(16+2)]	@ two
+
+	vadd.i32	q2,q2,q14
+	vadd.i32	q6,q6,q14
+	vadd.i32	q10,q10,q14
+	vadd.i32	d14,d14,d24	@ counter+1
+	vadd.i32	d22,d22,d26	@ counter+2
+
+	vadd.i32	q3,q3,q15
+	vadd.i32	q7,q7,q15
+	vadd.i32	q11,q11,q15
+
+	cmp	r11,#64*4
+	blo	Ltail_neon
+
+	vld1.8	{q12,q13},[r12]!	@ load input
+	mov	r11,sp
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12		@ xor with input
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	vst1.8	{q0,q1},[r14]!	@ store output
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vld1.32	{q0,q1},[r11]!	@ load for next iteration
+	veor	d25,d25,d25
+	vldr	d24,[sp,#4*(16+4)]	@ four
+	veor	q9,q9,q13
+	vld1.32	{q2,q3},[r11]
+	veor	q10,q10,q14
+	vst1.8	{q4,q5},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q6,q7},[r14]!
+
+	vadd.i32	d6,d6,d24	@ next counter value
+	vldr	d24,[sp,#4*(16+0)]	@ one
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	vst1.8	{q8,q9},[r14]!
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+	vst1.8	{q10,q11},[r14]!
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8	@ xor with input
+	add	r8,sp,#4*(4)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r5,r5,r9
+	ldr	r9,[r12,#-12]
+	add	r6,r6,r10
+	ldr	r10,[r12,#-8]
+	add	r7,r7,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+	add	r8,sp,#4*(8)
+	eor	r5,r5,r9
+	str	r4,[r14],#16		@ store output
+	eor	r6,r6,r10
+	str	r5,[r14,#-12]
+	eor	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r6,[r14,#-8]
+	add	r0,sp,#4*(16+8)
+	str	r7,[r14,#-4]
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	ldr	r8,[r12],#16		@ load input
+	add	r1,r1,r9
+	ldr	r9,[r12,#-12]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
+	add	r2,r2,r10
+	ldr	r10,[r12,#-8]
+# ifdef	__thumb2__
+	it	hi
+# endif
+	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
+	add	r3,r3,r11
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+# endif
+	eor	r0,r0,r8
+	add	r8,sp,#4*(12)
+	eor	r1,r1,r9
+	str	r0,[r14],#16		@ store output
+	eor	r2,r2,r10
+	str	r1,[r14,#-12]
+	eor	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+	str	r2,[r14,#-8]
+	str	r3,[r14,#-4]
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,r8,#4		@ next counter value
+	add	r5,r5,r9
+	str	r8,[sp,#4*(12)]	@ save next counter value
+	ldr	r8,[r12],#16		@ load input
+	add	r6,r6,r10
+	add	r4,r4,#3		@ counter+3
+	ldr	r9,[r12,#-12]
+	add	r7,r7,r11
+	ldr	r10,[r12,#-8]
+	ldr	r11,[r12,#-4]
+# ifdef	__ARMEB__
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	eor	r4,r4,r8
+# ifdef	__thumb2__
+	it	hi
+# endif
+	ldrhi	r8,[sp,#4*(32+2)]	@ re-load len
+	eor	r5,r5,r9
+	eor	r6,r6,r10
+	str	r4,[r14],#16		@ store output
+	eor	r7,r7,r11
+	str	r5,[r14,#-12]
+	sub	r11,r8,#64*4	@ len-=64*4
+	str	r6,[r14,#-8]
+	str	r7,[r14,#-4]
+	bhi	Loop_neon_outer
+
+	b	Ldone_neon
+
+.align	4
+Lbreak_neon:
+	@ harmonize NEON and integer-only stack frames: load data
+	@ from NEON frame, but save to integer-only one; distance
+	@ between the two is 4*(32+4+16-32)=4*(20).
+
+	str	r11, [sp,#4*(20+32+2)]	@ save len
+	add	r11,sp,#4*(32+4)
+	str	r12,   [sp,#4*(20+32+1)]	@ save inp
+	str	r14,   [sp,#4*(20+32+0)]	@ save out
+
+	ldr	r12,[sp,#4*(16+10)]
+	ldr	r14,[sp,#4*(16+11)]
+	vldmia	r11,{d8,d9,d10,d11,d12,d13,d14,d15}			@ fulfill ABI requirement
+	str	r12,[sp,#4*(20+16+10)]	@ copy "rx"
+	str	r14,[sp,#4*(20+16+11)]	@ copy "rx"
+
+	ldr	r11, [sp,#4*(15)]
+	ldr	r12,[sp,#4*(12)]		@ modulo-scheduled load
+	ldr	r10, [sp,#4*(13)]
+	ldr	r14,[sp,#4*(14)]
+	str	r11, [sp,#4*(20+16+15)]
+	add	r11,sp,#4*(20)
+	vst1.32	{q0,q1},[r11]!		@ copy key
+	add	sp,sp,#4*(20)			@ switch frame
+	vst1.32	{q2,q3},[r11]
+	mov	r11,#10
+	b	Loop				@ go integer-only
+
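+	@ Tail handling: Ltail_neon deals with the final, sub-256-byte
+	@ chunk. Remaining whole 64-byte blocks are XORed directly below;
+	@ the next unused keystream block is spilled to the stack so any
+	@ leftover partial block can be finished bytewise at Loop_tail_neon.
+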
+.align	4
+Ltail_neon:
+	cmp	r11,#64*3
+	bhs	L192_or_more_neon
+	cmp	r11,#64*2
+	bhs	L128_or_more_neon
+	cmp	r11,#64*1
+	bhs	L64_or_more_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q0,q1},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q2,q3},[r8]
+	b	Loop_tail_neon
+
+.align	4
+L64_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vst1.8	{q0,q1},[r14]!
+	vst1.8	{q2,q3},[r14]!
+
+	beq	Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q4,q5},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q6,q7},[r8]
+	sub	r11,r11,#64*1	@ len-=64*1
+	b	Loop_tail_neon
+
+.align	4
+L128_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vst1.8	{q0,q1},[r14]!
+	veor	q6,q6,q14
+	vst1.8	{q2,q3},[r14]!
+	veor	q7,q7,q15
+	vst1.8	{q4,q5},[r14]!
+	vst1.8	{q6,q7},[r14]!
+
+	beq	Ldone_neon
+
+	add	r8,sp,#4*(8)
+	vst1.8	{q8,q9},[sp]
+	add	r10,sp,#4*(0)
+	vst1.8	{q10,q11},[r8]
+	sub	r11,r11,#64*2	@ len-=64*2
+	b	Loop_tail_neon
+
+.align	4
+L192_or_more_neon:
+	vld1.8	{q12,q13},[r12]!
+	vld1.8	{q14,q15},[r12]!
+	veor	q0,q0,q12
+	veor	q1,q1,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q2,q2,q14
+	veor	q3,q3,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q4,q4,q12
+	veor	q5,q5,q13
+	vld1.8	{q12,q13},[r12]!
+	veor	q6,q6,q14
+	vst1.8	{q0,q1},[r14]!
+	veor	q7,q7,q15
+	vld1.8	{q14,q15},[r12]!
+
+	veor	q8,q8,q12
+	vst1.8	{q2,q3},[r14]!
+	veor	q9,q9,q13
+	vst1.8	{q4,q5},[r14]!
+	veor	q10,q10,q14
+	vst1.8	{q6,q7},[r14]!
+	veor	q11,q11,q15
+	vst1.8	{q8,q9},[r14]!
+	vst1.8	{q10,q11},[r14]!
+
+	beq	Ldone_neon
+
+	ldmia	sp,{r8,r9,r10,r11}	@ load key material
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(4)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r0,sp,#4*(16+8)
+
+	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
+
+	add	r0,r0,r8	@ accumulate key material
+	add	r8,sp,#4*(12)
+	add	r1,r1,r9
+	add	r2,r2,r10
+	add	r3,r3,r11
+	ldmia	r8,{r8,r9,r10,r11}	@ load key material
+
+	add	r4,r4,r8	@ accumulate key material
+	add	r8,sp,#4*(8)
+	add	r5,r5,r9
+	add	r4,r4,#3		@ counter+3
+	add	r6,r6,r10
+	add	r7,r7,r11
+	ldr	r11,[sp,#4*(32+2)]	@ re-load len
+# ifdef	__ARMEB__
+	rev	r0,r0
+	rev	r1,r1
+	rev	r2,r2
+	rev	r3,r3
+	rev	r4,r4
+	rev	r5,r5
+	rev	r6,r6
+	rev	r7,r7
+# endif
+	stmia	r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+	add	r10,sp,#4*(0)
+	sub	r11,r11,#64*3	@ len-=64*3
+
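+	@ Bytewise tail: the remaining (<64) bytes of keystream were
+	@ stored to the scratch area at the bottom of the stack frame
+	@ (r10), so the last partial block is finished by XORing input
+	@ bytes (r12) against that buffer one at a time.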
+Loop_tail_neon:
+	ldrb	r8,[r10],#1	@ read buffer on stack
+	ldrb	r9,[r12],#1		@ read input
+	subs	r11,r11,#1
+	eor	r8,r8,r9
+	strb	r8,[r14],#1		@ store output
+	bne	Loop_tail_neon
+
+Ldone_neon:
+	add	sp,sp,#4*(32+4)
+	vldmia	sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+	add	sp,sp,#4*(16+3)
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/aesv8-armx32.S b/apple-arm/crypto/fipsmodule/aesv8-armx32.S
new file mode 100644
index 0000000..87b4b0a
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/aesv8-armx32.S
@@ -0,0 +1,809 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+
+.code	32
+#undef	__thumb2__
+.align	5
+Lrcon:
+.long	0x01,0x01,0x01,0x01
+.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
+.long	0x1b,0x1b,0x1b,0x1b
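+
+@ Lrcon above drives the key expansion: the 0x01 row is the initial
+@ round constant (doubled each round by the vshl.u8 below), the 0x1b
+@ row seeds the final rounds where doubling past 0x80 would need a
+@ reduction by the AES polynomial, and the middle row is a vtbl byte
+@ index that, in effect, rotates the top word of the previous round
+@ key by one byte (RotWord) and splats it into every 32-bit lane;
+@ SubWord is then a single aese against the all-zero key in q0.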
+
+.text
+
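+@ The Crypto Extension instructions (aese/aesd/aesmc/aesimc) below are
+@ emitted as raw .byte sequences, with the mnemonic kept in a comment,
+@ so the file still assembles with toolchains that lack ARMv8 crypto
+@ support.
+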
+.globl	_aes_hw_set_encrypt_key
+.private_extern	_aes_hw_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func	_aes_hw_set_encrypt_key
+#endif
+.align	5
+_aes_hw_set_encrypt_key:
+Lenc_key:
+	mov	r3,#-1
+	cmp	r0,#0
+	beq	Lenc_key_abort
+	cmp	r2,#0
+	beq	Lenc_key_abort
+	mov	r3,#-2
+	cmp	r1,#128
+	blt	Lenc_key_abort
+	cmp	r1,#256
+	bgt	Lenc_key_abort
+	tst	r1,#0x3f
+	bne	Lenc_key_abort
+
+	adr	r3,Lrcon
+	cmp	r1,#192
+
+	veor	q0,q0,q0
+	vld1.8	{q3},[r0]!
+	mov	r1,#8		@ reuse r1
+	vld1.32	{q1,q2},[r3]!
+
+	blt	Loop128
+	beq	L192
+	b	L256
+
+.align	4
+Loop128:
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	bne	Loop128
+
+	vld1.32	{q1},[r3]
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+
+	vtbl.8	d20,{q3},d4
+	vtbl.8	d21,{q3},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q3},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]
+	add	r2,r2,#0x50
+
+	mov	r12,#10
+	b	Ldone
+
+.align	4
+L192:
+	vld1.8	{d16},[r0]!
+	vmov.i8	q10,#8			@ borrow q10
+	vst1.32	{q3},[r2]!
+	vsub.i8	q2,q2,q10	@ adjust the mask
+
+Loop192:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{d16},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+
+	vdup.32	q9,d7[1]
+	veor	q9,q9,q8
+	veor	q10,q10,q1
+	vext.8	q8,q0,q8,#12
+	vshl.u8	q1,q1,#1
+	veor	q8,q8,q9
+	veor	q3,q3,q10
+	veor	q8,q8,q10
+	vst1.32	{q3},[r2]!
+	bne	Loop192
+
+	mov	r12,#12
+	add	r2,r2,#0x20
+	b	Ldone
+
+.align	4
+L256:
+	vld1.8	{q8},[r0]
+	mov	r1,#7
+	mov	r12,#14
+	vst1.32	{q3},[r2]!
+
+Loop256:
+	vtbl.8	d20,{q8},d4
+	vtbl.8	d21,{q8},d5
+	vext.8	q9,q0,q3,#12
+	vst1.32	{q8},[r2]!
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+	subs	r1,r1,#1
+
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q3,q3,q9
+	vext.8	q9,q0,q9,#12
+	veor	q10,q10,q1
+	veor	q3,q3,q9
+	vshl.u8	q1,q1,#1
+	veor	q3,q3,q10
+	vst1.32	{q3},[r2]!
+	beq	Ldone
+
+	vdup.32	q10,d7[1]
+	vext.8	q9,q0,q8,#12
+.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
+
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+	vext.8	q9,q0,q9,#12
+	veor	q8,q8,q9
+
+	veor	q8,q8,q10
+	b	Loop256
+
+Ldone:
+	str	r12,[r2]
+	mov	r3,#0
+
+Lenc_key_abort:
+	mov	r0,r3			@ return value
+
+	bx	lr
+
+
+.globl	_aes_hw_set_decrypt_key
+.private_extern	_aes_hw_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func	_aes_hw_set_decrypt_key
+#endif
+.align	5
+_aes_hw_set_decrypt_key:
+	stmdb	sp!,{r4,lr}
+	bl	Lenc_key
+
+	cmp	r0,#0
+	bne	Ldec_key_abort
+
+	sub	r2,r2,#240		@ restore original r2
+	mov	r4,#-16
+	add	r0,r2,r12,lsl#4	@ end of key schedule
+
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+
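+	@ Loop_imc converts the encryption schedule into a decryption
+	@ schedule: round keys are swapped end-for-end (the outermost pair
+	@ was swapped just above, without AESIMC) and AESIMC
+	@ (InvMixColumns) is applied to every inner key, as required by
+	@ the equivalent inverse cipher implemented by aesd/aesimc.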
+Loop_imc:
+	vld1.32	{q0},[r2]
+	vld1.32	{q1},[r0]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+	vst1.32	{q0},[r0],r4
+	vst1.32	{q1},[r2]!
+	cmp	r0,r2
+	bhi	Loop_imc
+
+	vld1.32	{q0},[r2]
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+	vst1.32	{q0},[r0]
+
+	eor	r0,r0,r0		@ return value
+Ldec_key_abort:
+	ldmia	sp!,{r4,pc}
+
+.globl	_aes_hw_encrypt
+.private_extern	_aes_hw_encrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_encrypt
+#endif
+.align	5
+_aes_hw_encrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+Loop_enc:
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	Loop_enc
+
+.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
+.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+
+.globl	_aes_hw_decrypt
+.private_extern	_aes_hw_decrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_decrypt
+#endif
+.align	5
+_aes_hw_decrypt:
+	AARCH64_VALID_CALL_TARGET
+	ldr	r3,[r2,#240]
+	vld1.32	{q0},[r2]!
+	vld1.8	{q2},[r0]
+	sub	r3,r3,#2
+	vld1.32	{q1},[r2]!
+
+Loop_dec:
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]!
+	subs	r3,r3,#2
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q1},[r2]!
+	bgt	Loop_dec
+
+.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
+.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
+	vld1.32	{q0},[r2]
+.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
+	veor	q2,q2,q0
+
+	vst1.8	{q2},[r1]
+	bx	lr
+
+.globl	_aes_hw_cbc_encrypt
+.private_extern	_aes_hw_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func	_aes_hw_cbc_encrypt
+#endif
+.align	5
+_aes_hw_cbc_encrypt:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load remaining args
+	subs	r2,r2,#16
+	mov	r8,#16
+	blo	Lcbc_abort
+	moveq	r8,#0
+
+	cmp	r5,#0			@ en- or decrypting?
+	ldr	r5,[r3,#240]
+	and	r2,r2,#-16
+	vld1.8	{q6},[r4]
+	vld1.8	{q0},[r0],r8
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#6
+	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
+	sub	r5,r5,#2
+	vld1.32	{q10,q11},[r7]!
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+
+	add	r7,r3,#32
+	mov	r6,r5
+	beq	Lcbc_dec
+
+	cmp	r5,#2
+	veor	q0,q0,q6
+	veor	q5,q8,q7
+	beq	Lcbc_enc128
+
+	vld1.32	{q2,q3},[r7]
+	add	r7,r3,#16
+	add	r6,r3,#16*4
+	add	r12,r3,#16*5
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	add	r14,r3,#16*6
+	add	r3,r3,#16*7
+	b	Lenter_cbc_enc
+
+.align	4
+Loop_cbc_enc:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+Lenter_cbc_enc:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r6]
+	cmp	r5,#4
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r12]
+	beq	Lcbc_enc192
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q8},[r14]
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r3]
+	nop
+
+Lcbc_enc192:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.32	{q9},[r7]		@ re-pre-load rndkey[1]
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	Loop_cbc_enc
+
+	vst1.8	{q6},[r1]!
+	b	Lcbc_done
+
+.align	5
+Lcbc_enc128:
+	vld1.32	{q2,q3},[r7]
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	b	Lenter_cbc_enc128
+Loop_cbc_enc128:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vst1.8	{q6},[r1]!
+Lenter_cbc_enc128:
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	subs	r2,r2,#16
+.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	moveq	r8,#0
+.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	vld1.8	{q8},[r0],r8
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+	veor	q8,q8,q5
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+	veor	q6,q0,q7
+	bhs	Loop_cbc_enc128
+
+	vst1.8	{q6},[r1]!
+	b	Lcbc_done
+.align	5
+Lcbc_dec:
+	vld1.8	{q10},[r0]!
+	subs	r2,r2,#32		@ bias
+	add	r6,r5,#2
+	vorr	q3,q0,q0
+	vorr	q1,q0,q0
+	vorr	q11,q10,q10
+	blo	Lcbc_dec_tail
+
+	vorr	q1,q10,q10
+	vld1.8	{q10},[r0]!
+	vorr	q2,q0,q0
+	vorr	q3,q1,q1
+	vorr	q11,q10,q10
+
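+	@ Main decrypt loop: three blocks (q0, q1, q10) are kept in
+	@ flight so their aesd/aesimc pairs interleave and hide the
+	@ instruction latency; unlike CBC encryption above, CBC decryption
+	@ carries no dependency from one block to the next.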
+Loop3x_cbc_dec:
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Loop3x_cbc_dec
+
+.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q4,q6,q7
+	subs	r2,r2,#0x30
+	veor	q5,q2,q7
+	movlo	r6,r2			@ r6 is zero at this point
+.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+	add	r0,r0,r6		@ r0 is adjusted in such a way that
+					@ at exit from the loop q1-q10
+					@ are loaded with the last "words"
+	vorr	q6,q11,q11
+	mov	r7,r3
+.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q2},[r0]!
+.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q3},[r0]!
+.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
+.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.8	{q11},[r0]!
+.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	add	r6,r5,#2
+	veor	q4,q4,q0
+	veor	q5,q5,q1
+	veor	q10,q10,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q4},[r1]!
+	vorr	q0,q2,q2
+	vst1.8	{q5},[r1]!
+	vorr	q1,q3,q3
+	vst1.8	{q10},[r1]!
+	vorr	q10,q11,q11
+	bhs	Loop3x_cbc_dec
+
+	cmn	r2,#0x30
+	beq	Lcbc_done
+	nop
+
+Lcbc_dec_tail:
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Lcbc_dec_tail
+
+.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	cmn	r2,#0x20
+.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q5,q6,q7
+.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
+.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
+.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
+.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
+	veor	q9,q3,q7
+.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
+.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
+	beq	Lcbc_dec_one
+	veor	q5,q5,q1
+	veor	q9,q9,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+	vst1.8	{q9},[r1]!
+	b	Lcbc_done
+
+Lcbc_dec_one:
+	veor	q5,q5,q10
+	vorr	q6,q11,q11
+	vst1.8	{q5},[r1]!
+
+Lcbc_done:
+	vst1.8	{q6},[r4]
+Lcbc_abort:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,pc}
+
+.globl	_aes_hw_ctr32_encrypt_blocks
+.private_extern	_aes_hw_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func	_aes_hw_ctr32_encrypt_blocks
+#endif
+.align	5
+_aes_hw_ctr32_encrypt_blocks:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}            @ ABI specification says so
+	ldr	r4, [ip]		@ load remaining arg
+	ldr	r5,[r3,#240]
+
+	ldr	r8, [r4, #12]
+	vld1.32	{q0},[r4]
+
+	vld1.32	{q8,q9},[r3]		@ load key schedule...
+	sub	r5,r5,#4
+	mov	r12,#16
+	cmp	r2,#2
+	add	r7,r3,r5,lsl#4	@ pointer to last 5 round keys
+	sub	r5,r5,#2
+	vld1.32	{q12,q13},[r7]!
+	vld1.32	{q14,q15},[r7]!
+	vld1.32	{q7},[r7]
+	add	r7,r3,#32
+	mov	r6,r5
+	movlo	r12,#0
+
+	@ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+	@ affected by silicon errata #1742098 [0] and #1655431 [1],
+	@ respectively, where the second instruction of an aese/aesmc
+	@ instruction pair may execute twice if an interrupt is taken right
+	@ after the first instruction consumes an input register of which a
+	@ single 32-bit lane has been updated the last time it was modified.
+	@
+	@ This function uses a counter in one 32-bit lane. The vmov.32
+	@ lines could write to q1 and q10 directly, but that trips this
+	@ bug. We write to q6 and copy to the final register as a
+	@ workaround.
+	@
+	@ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+	@ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
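+	@
+	@ In outline, instead of the straightforward
+	@	vmov.32	d3[1],r10	@ touch one lane of q1 in place
+	@ the counter lane is staged in q6 and the whole register is then
+	@ copied, so no partially-updated register ever feeds aese:
+	@	vmov.32	d13[1],r10	@ update the staging register q6
+	@	vorr	q1,q6,q6	@ copy q6 to q1 wholesale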
+#ifndef __ARMEB__
+	rev	r8, r8
+#endif
+	add	r10, r8, #1
+	vorr	q6,q0,q0
+	rev	r10, r10
+	vmov.32	d13[1],r10
+	add	r8, r8, #2
+	vorr	q1,q6,q6
+	bls	Lctr32_tail
+	rev	r12, r8
+	vmov.32	d13[1],r12
+	sub	r2,r2,#3		@ bias
+	vorr	q10,q6,q6
+	b	Loop3x_ctr32
+
+.align	4
+Loop3x_ctr32:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.32	{q9},[r7]!
+	bgt	Loop3x_ctr32
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
+	vld1.8	{q2},[r0]!
+	add	r9,r8,#1
+.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
+.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
+	vld1.8	{q3},[r0]!
+	rev	r9,r9
+.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vld1.8	{q11},[r0]!
+	mov	r7,r3
+.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
+.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
+.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	veor	q2,q2,q7
+	add	r10,r8,#2
+.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	veor	q3,q3,q7
+	add	r8,r8,#3
+.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	 @ Note the logic to update q0, q1, and q10 is written to work
+	 @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+	 @ 32-bit mode. See the comment above.
+	veor	q11,q11,q7
+	vmov.32	d13[1], r9
+.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q0,q6,q6
+	rev	r10,r10
+.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
+.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
+	vmov.32	d13[1], r10
+	rev	r12,r8
+.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
+.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
+	vorr	q1,q6,q6
+	vmov.32	d13[1], r12
+.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
+.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
+	vorr	q10,q6,q6
+	subs	r2,r2,#3
+.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
+.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
+.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
+
+	veor	q2,q2,q4
+	vld1.32	{q8},[r7]!	@ re-pre-load rndkey[0]
+	vst1.8	{q2},[r1]!
+	veor	q3,q3,q5
+	mov	r6,r5
+	vst1.8	{q3},[r1]!
+	veor	q11,q11,q9
+	vld1.32	{q9},[r7]!	@ re-pre-load rndkey[1]
+	vst1.8	{q11},[r1]!
+	bhs	Loop3x_ctr32
+
+	adds	r2,r2,#3
+	beq	Lctr32_done
+	cmp	r2,#1
+	mov	r12,#16
+	moveq	r12,#0
+
+Lctr32_tail:
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q8},[r7]!
+	subs	r6,r6,#2
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.32	{q9},[r7]!
+	bgt	Lctr32_tail
+
+.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q2},[r0],r12
+.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	vld1.8	{q3},[r0]
+.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q2,q2,q7
+.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
+.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
+.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
+.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
+	veor	q3,q3,q7
+.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
+.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
+
+	cmp	r2,#1
+	veor	q2,q2,q0
+	veor	q3,q3,q1
+	vst1.8	{q2},[r1]!
+	beq	Lctr32_done
+	vst1.8	{q3},[r1]
+
+Lctr32_done:
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/armv4-mont.S b/apple-arm/crypto/fipsmodule/armv4-mont.S
new file mode 100644
index 0000000..e549d1f
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/armv4-mont.S
@@ -0,0 +1,982 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align	5
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-Lbn_mul_mont
+#endif
+
+.globl	_bn_mul_mont
+.private_extern	_bn_mul_mont
+#ifdef __thumb2__
+.thumb_func	_bn_mul_mont
+#endif
+
+.align	5
+_bn_mul_mont:
+Lbn_mul_mont:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+	tst	ip,#7
+	bne	Lialu
+	adr	r0,Lbn_mul_mont
+	ldr	r2,LOPENSSL_armcap
+	ldr	r0,[r0,r2]
+#ifdef	__APPLE__
+	ldr	r0,[r0]
+#endif
+	tst	r0,#ARMV7_NEON		@ NEON available?
+	ldmia	sp, {r0,r2}
+	beq	Lialu
+	add	sp,sp,#8
+	b	bn_mul8x_mont_neon
+.align	4
+Lialu:
+#endif
+	cmp	ip,#2
+	mov	r0,ip			@ load num
+#ifdef	__thumb2__
+	ittt	lt
+#endif
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	Labrt
+
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
+
+	mov	r0,r0,lsl#2		@ rescale r0 for byte count
+	sub	sp,sp,r0		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	r0,r0,#4		@ "num=num-1"
+	add	r4,r2,r0		@ &bp[num-1]
+
+	add	r0,sp,r0		@ r0 to point at &tp[num-1]
+	ldr	r8,[r0,#14*4]		@ &n0
+	ldr	r2,[r2]		@ bp[0]
+	ldr	r5,[r1],#4		@ ap[0],ap++
+	ldr	r6,[r3],#4		@ np[0],np++
+	ldr	r8,[r8]		@ *n0
+	str	r4,[r0,#15*4]		@ save &bp[num]
+
+	umull	r10,r11,r5,r2	@ ap[0]*bp[0]
+	str	r8,[r0,#14*4]		@ save n0 value
+	mul	r8,r10,r8		@ "tp[0]"*n0
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"t[0]"
+	mov	r4,sp
+
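+	@ Word-serial Montgomery multiplication: for each word bp[i],
+	@	m  = tp[0]*n0 mod 2^32		(n0 = -np[0]^-1 mod 2^32)
+	@	tp = (tp + ap*bp[i] + np*m) / 2^32
+	@ Adding np*m clears the low word, so the division by 2^32 is
+	@ exact; after num iterations tp == ap*bp*R^-1 mod np with
+	@ R = 2^(32*num), up to the final subtraction done at Lsub.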
+L1st:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	mov	r10,r11
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[0]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	L1st
+
+	adds	r12,r12,r11
+	ldr	r4,[r0,#13*4]		@ restore bp
+	mov	r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	mov	r7,sp
+	str	r14,[r0,#4]		@ tp[num]=
+
+Louter:
+	sub	r7,r0,r7		@ "original" r0-1 value
+	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
+	ldr	r2,[r4,#4]!		@ *(++bp)
+	sub	r3,r3,r7		@ "rewind" np to &np[1]
+	ldr	r5,[r1,#-4]		@ ap[0]
+	ldr	r10,[sp]		@ tp[0]
+	ldr	r6,[r3,#-4]		@ np[0]
+	ldr	r7,[sp,#4]		@ tp[1]
+
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[0]*bp[i]+tp[0]
+	str	r4,[r0,#13*4]		@ save bp
+	mul	r8,r10,r8
+	mov	r12,#0
+	umlal	r10,r12,r6,r8	@ np[0]*n0+"tp[0]"
+	mov	r4,sp
+
+Linner:
+	ldr	r5,[r1],#4		@ ap[j],ap++
+	adds	r10,r11,r7		@ +=tp[j]
+	ldr	r6,[r3],#4		@ np[j],np++
+	mov	r11,#0
+	umlal	r10,r11,r5,r2	@ ap[j]*bp[i]
+	mov	r14,#0
+	umlal	r12,r14,r6,r8	@ np[j]*n0
+	adc	r11,r11,#0
+	ldr	r7,[r4,#8]		@ tp[j+1]
+	adds	r12,r12,r10
+	str	r12,[r4],#4		@ tp[j-1]=,tp++
+	adc	r12,r14,#0
+	cmp	r4,r0
+	bne	Linner
+
+	adds	r12,r12,r11
+	mov	r14,#0
+	ldr	r4,[r0,#13*4]		@ restore bp
+	adc	r14,r14,#0
+	ldr	r8,[r0,#14*4]		@ restore n0
+	adds	r12,r12,r7
+	ldr	r7,[r0,#15*4]		@ restore &bp[num]
+	adc	r14,r14,#0
+	str	r12,[r0]		@ tp[num-1]=
+	str	r14,[r0,#4]		@ tp[num]=
+
+	cmp	r4,r7
+#ifdef	__thumb2__
+	itt	ne
+#endif
+	movne	r7,sp
+	bne	Louter
+
+	ldr	r2,[r0,#12*4]		@ pull rp
+	mov	r5,sp
+	add	r0,r0,#4		@ r0 to point at &tp[num]
+	sub	r5,r0,r5		@ "original" num value
+	mov	r4,sp			@ "rewind" r4
+	mov	r1,r4			@ "borrow" r1
+	sub	r3,r3,r5		@ "rewind" r3 to &np[0]
+
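+	@ Final reduction: tp may still be >= np, so tp-np is written to
+	@ rp at Lsub, and Lcopy then selects tp or the difference with a
+	@ branch-free movcc on the borrow while wiping tp, avoiding any
+	@ secret-dependent branch.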
+	subs	r7,r7,r7		@ "clear" carry flag
+Lsub:	ldr	r7,[r4],#4
+	ldr	r6,[r3],#4
+	sbcs	r7,r7,r6		@ tp[j]-np[j]
+	str	r7,[r2],#4		@ rp[j]=
+	teq	r4,r0		@ preserve carry
+	bne	Lsub
+	sbcs	r14,r14,#0		@ upmost carry
+	mov	r4,sp			@ "rewind" r4
+	sub	r2,r2,r5		@ "rewind" r2
+
+Lcopy:	ldr	r7,[r4]		@ conditional copy
+	ldr	r5,[r2]
+	str	sp,[r4],#4		@ zap tp
+#ifdef	__thumb2__
+	it	cc
+#endif
+	movcc	r5,r7
+	str	r5,[r2],#4
+	teq	r4,r0		@ preserve carry
+	bne	Lcopy
+
+	mov	sp,r0
+	add	sp,sp,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+Labrt:
+#if __ARM_ARCH__>=5
+	bx	lr				@ bx lr
+#else
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func	bn_mul8x_mont_neon
+#endif
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+	ldmia	ip,{r4,r5}		@ load rest of parameter block
+	mov	ip,sp
+
+	cmp	r5,#8
+	bhi	LNEON_8n
+
+	@ special case for r5==8, everything is in register bank...
+
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	sub	r7,sp,r5,lsl#4
+	vld1.32	{d0,d1,d2,d3},  [r1]!		@ can't specify :32 :-(
+	and	r7,r7,#-64
+	vld1.32	{d30[0]}, [r4,:32]
+	mov	sp,r7			@ alloca
+	vzip.16	d28,d8
+
+	vmull.u32	q6,d28,d0[0]
+	vmull.u32	q7,d28,d0[1]
+	vmull.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmull.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	vmul.u32	d29,d29,d30
+
+	vmull.u32	q10,d28,d2[0]
+	vld1.32	{d4,d5,d6,d7}, [r3]!
+	vmull.u32	q11,d28,d2[1]
+	vmull.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmull.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	sub	r9,r5,#1
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	b	LNEON_outer8
+
+.align	4
+LNEON_outer8:
+	vld1.32	{d28[0]}, [r2,:32]!
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	vadd.u64	d12,d12,d10
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+
+	vadd.u64	d29,d29,d12
+	veor	d8,d8,d8
+	subs	r9,r9,#1
+	vmul.u32	d29,d29,d30
+
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+
+	vmlal.u32	q10,d29,d6[0]
+	vmov	q5,q6
+	vmlal.u32	q11,d29,d6[1]
+	vmov	q6,q7
+	vmlal.u32	q12,d29,d7[0]
+	vmov	q7,q8
+	vmlal.u32	q13,d29,d7[1]
+	vmov	q8,q9
+	vmov	q9,q10
+	vshr.u64	d10,d10,#16
+	vmov	q10,q11
+	vmov	q11,q12
+	vadd.u64	d10,d10,d11
+	vmov	q12,q13
+	veor	q13,q13
+	vshr.u64	d10,d10,#16
+
+	bne	LNEON_outer8
+
+	vadd.u64	d12,d12,d10
+	mov	r7,sp
+	vshr.u64	d10,d12,#16
+	mov	r8,r5
+	vadd.u64	d13,d13,d10
+	add	r6,sp,#96
+	vshr.u64	d10,d13,#16
+	vzip.16	d12,d13
+
+	b	LNEON_tail_entry
+
+.align	4
+LNEON_8n:
+	veor	q6,q6,q6
+	sub	r7,sp,#128
+	veor	q7,q7,q7
+	sub	r7,r7,r5,lsl#4
+	veor	q8,q8,q8
+	and	r7,r7,#-64
+	veor	q9,q9,q9
+	mov	sp,r7			@ alloca
+	veor	q10,q10,q10
+	add	r7,r7,#256
+	veor	q11,q11,q11
+	sub	r8,r5,#8
+	veor	q12,q12,q12
+	veor	q13,q13,q13
+
+LNEON_8n_init:
+	vst1.64	{q6,q7},[r7,:256]!
+	subs	r8,r8,#8
+	vst1.64	{q8,q9},[r7,:256]!
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12,q13},[r7,:256]!
+	bne	LNEON_8n_init
+
+	add	r6,sp,#256
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	add	r10,sp,#8
+	vld1.32	{d30[0]},[r4,:32]
+	mov	r9,r5
+	b	LNEON_8n_outer
+
+.align	4
+LNEON_8n_outer:
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	veor	d8,d8,d8
+	vzip.16	d28,d8
+	add	r7,sp,#128
+	vld1.32	{d4,d5,d6,d7},[r3]!
+
+	vmlal.u32	q6,d28,d0[0]
+	vmlal.u32	q7,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q8,d28,d1[0]
+	vshl.i64	d29,d13,#16
+	vmlal.u32	q9,d28,d1[1]
+	vadd.u64	d29,d29,d12
+	vmlal.u32	q10,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q11,d28,d2[1]
+	vst1.32	{d28},[sp,:64]		@ put aside smashed b[8*i+0]
+	vmlal.u32	q12,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q6,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q7,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q8,d29,d5[0]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vadd.u64	d12,d12,d13
+	vmlal.u32	q11,d29,d6[1]
+	vshr.u64	d12,d12,#16
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vadd.u64	d14,d14,d12
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+0]
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]!
+	vmlal.u32	q8,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q9,d28,d1[0]
+	vshl.i64	d29,d15,#16
+	vmlal.u32	q10,d28,d1[1]
+	vadd.u64	d29,d29,d14
+	vmlal.u32	q11,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q12,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+1]
+	vmlal.u32	q13,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q7,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q8,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q9,d29,d5[0]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vadd.u64	d14,d14,d15
+	vmlal.u32	q12,d29,d6[1]
+	vshr.u64	d14,d14,#16
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vadd.u64	d16,d16,d14
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+1]
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]!
+	vmlal.u32	q9,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q10,d28,d1[0]
+	vshl.i64	d29,d17,#16
+	vmlal.u32	q11,d28,d1[1]
+	vadd.u64	d29,d29,d16
+	vmlal.u32	q12,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q13,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+2]
+	vmlal.u32	q6,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q8,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q9,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q10,d29,d5[0]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vadd.u64	d16,d16,d17
+	vmlal.u32	q13,d29,d6[1]
+	vshr.u64	d16,d16,#16
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vadd.u64	d18,d18,d16
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+2]
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]!
+	vmlal.u32	q10,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q11,d28,d1[0]
+	vshl.i64	d29,d19,#16
+	vmlal.u32	q12,d28,d1[1]
+	vadd.u64	d29,d29,d18
+	vmlal.u32	q13,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q6,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+3]
+	vmlal.u32	q7,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q9,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q10,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q11,d29,d5[0]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vadd.u64	d18,d18,d19
+	vmlal.u32	q6,d29,d6[1]
+	vshr.u64	d18,d18,#16
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vadd.u64	d20,d20,d18
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+3]
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]!
+	vmlal.u32	q11,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q12,d28,d1[0]
+	vshl.i64	d29,d21,#16
+	vmlal.u32	q13,d28,d1[1]
+	vadd.u64	d29,d29,d20
+	vmlal.u32	q6,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q7,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+4]
+	vmlal.u32	q8,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q10,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q11,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q12,d29,d5[0]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vadd.u64	d20,d20,d21
+	vmlal.u32	q7,d29,d6[1]
+	vshr.u64	d20,d20,#16
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vadd.u64	d22,d22,d20
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+4]
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]!
+	vmlal.u32	q12,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q13,d28,d1[0]
+	vshl.i64	d29,d23,#16
+	vmlal.u32	q6,d28,d1[1]
+	vadd.u64	d29,d29,d22
+	vmlal.u32	q7,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q8,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+5]
+	vmlal.u32	q9,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q11,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q12,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q13,d29,d5[0]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vadd.u64	d22,d22,d23
+	vmlal.u32	q8,d29,d6[1]
+	vshr.u64	d22,d22,#16
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vadd.u64	d24,d24,d22
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+5]
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]!
+	vmlal.u32	q13,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q6,d28,d1[0]
+	vshl.i64	d29,d25,#16
+	vmlal.u32	q7,d28,d1[1]
+	vadd.u64	d29,d29,d24
+	vmlal.u32	q8,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q9,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+6]
+	vmlal.u32	q10,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28[0]},[r2,:32]!	@ *b++
+	vmlal.u32	q12,d29,d4[0]
+	veor	d10,d10,d10
+	vmlal.u32	q13,d29,d4[1]
+	vzip.16	d28,d10
+	vmlal.u32	q6,d29,d5[0]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vadd.u64	d24,d24,d25
+	vmlal.u32	q9,d29,d6[1]
+	vshr.u64	d24,d24,#16
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vadd.u64	d26,d26,d24
+	vst1.32	{d29},[r10,:64]!	@ put aside smashed m[8*i+6]
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]!
+	vmlal.u32	q6,d28,d0[1]
+	veor	d8,d8,d8
+	vmlal.u32	q7,d28,d1[0]
+	vshl.i64	d29,d27,#16
+	vmlal.u32	q8,d28,d1[1]
+	vadd.u64	d29,d29,d26
+	vmlal.u32	q9,d28,d2[0]
+	vmul.u32	d29,d29,d30
+	vmlal.u32	q10,d28,d2[1]
+	vst1.32	{d28},[r10,:64]!	@ put aside smashed b[8*i+7]
+	vmlal.u32	q11,d28,d3[0]
+	vzip.16	d29,d8
+	vmlal.u32	q12,d28,d3[1]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q6,d29,d4[1]
+	vmlal.u32	q7,d29,d5[0]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vadd.u64	d26,d26,d27
+	vmlal.u32	q10,d29,d6[1]
+	vshr.u64	d26,d26,#16
+	vmlal.u32	q11,d29,d7[0]
+	vmlal.u32	q12,d29,d7[1]
+	vadd.u64	d12,d12,d26
+	vst1.32	{d29},[r10,:64]	@ put aside smashed m[8*i+7]
+	add	r10,sp,#8		@ rewind
+	sub	r8,r5,#8
+	b	LNEON_8n_inner
+
+.align	4
+LNEON_8n_inner:
+	subs	r8,r8,#8
+	vmlal.u32	q6,d28,d0[0]
+	vld1.64	{q13},[r6,:128]
+	vmlal.u32	q7,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+0]
+	vmlal.u32	q8,d28,d1[0]
+	vld1.32	{d4,d5,d6,d7},[r3]!
+	vmlal.u32	q9,d28,d1[1]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d2[0]
+	vmlal.u32	q11,d28,d2[1]
+	vmlal.u32	q12,d28,d3[0]
+	vmlal.u32	q13,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+1]
+	vmlal.u32	q6,d29,d4[0]
+	vmlal.u32	q7,d29,d4[1]
+	vmlal.u32	q8,d29,d5[0]
+	vmlal.u32	q9,d29,d5[1]
+	vmlal.u32	q10,d29,d6[0]
+	vmlal.u32	q11,d29,d6[1]
+	vmlal.u32	q12,d29,d7[0]
+	vmlal.u32	q13,d29,d7[1]
+	vst1.64	{q6},[r7,:128]!
+	vmlal.u32	q7,d28,d0[0]
+	vld1.64	{q6},[r6,:128]
+	vmlal.u32	q8,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+1]
+	vmlal.u32	q9,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q10,d28,d1[1]
+	vmlal.u32	q11,d28,d2[0]
+	vmlal.u32	q12,d28,d2[1]
+	vmlal.u32	q13,d28,d3[0]
+	vmlal.u32	q6,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+2]
+	vmlal.u32	q7,d29,d4[0]
+	vmlal.u32	q8,d29,d4[1]
+	vmlal.u32	q9,d29,d5[0]
+	vmlal.u32	q10,d29,d5[1]
+	vmlal.u32	q11,d29,d6[0]
+	vmlal.u32	q12,d29,d6[1]
+	vmlal.u32	q13,d29,d7[0]
+	vmlal.u32	q6,d29,d7[1]
+	vst1.64	{q7},[r7,:128]!
+	vmlal.u32	q8,d28,d0[0]
+	vld1.64	{q7},[r6,:128]
+	vmlal.u32	q9,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+2]
+	vmlal.u32	q10,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q11,d28,d1[1]
+	vmlal.u32	q12,d28,d2[0]
+	vmlal.u32	q13,d28,d2[1]
+	vmlal.u32	q6,d28,d3[0]
+	vmlal.u32	q7,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+3]
+	vmlal.u32	q8,d29,d4[0]
+	vmlal.u32	q9,d29,d4[1]
+	vmlal.u32	q10,d29,d5[0]
+	vmlal.u32	q11,d29,d5[1]
+	vmlal.u32	q12,d29,d6[0]
+	vmlal.u32	q13,d29,d6[1]
+	vmlal.u32	q6,d29,d7[0]
+	vmlal.u32	q7,d29,d7[1]
+	vst1.64	{q8},[r7,:128]!
+	vmlal.u32	q9,d28,d0[0]
+	vld1.64	{q8},[r6,:128]
+	vmlal.u32	q10,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+3]
+	vmlal.u32	q11,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q12,d28,d1[1]
+	vmlal.u32	q13,d28,d2[0]
+	vmlal.u32	q6,d28,d2[1]
+	vmlal.u32	q7,d28,d3[0]
+	vmlal.u32	q8,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+4]
+	vmlal.u32	q9,d29,d4[0]
+	vmlal.u32	q10,d29,d4[1]
+	vmlal.u32	q11,d29,d5[0]
+	vmlal.u32	q12,d29,d5[1]
+	vmlal.u32	q13,d29,d6[0]
+	vmlal.u32	q6,d29,d6[1]
+	vmlal.u32	q7,d29,d7[0]
+	vmlal.u32	q8,d29,d7[1]
+	vst1.64	{q9},[r7,:128]!
+	vmlal.u32	q10,d28,d0[0]
+	vld1.64	{q9},[r6,:128]
+	vmlal.u32	q11,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+4]
+	vmlal.u32	q12,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q13,d28,d1[1]
+	vmlal.u32	q6,d28,d2[0]
+	vmlal.u32	q7,d28,d2[1]
+	vmlal.u32	q8,d28,d3[0]
+	vmlal.u32	q9,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+5]
+	vmlal.u32	q10,d29,d4[0]
+	vmlal.u32	q11,d29,d4[1]
+	vmlal.u32	q12,d29,d5[0]
+	vmlal.u32	q13,d29,d5[1]
+	vmlal.u32	q6,d29,d6[0]
+	vmlal.u32	q7,d29,d6[1]
+	vmlal.u32	q8,d29,d7[0]
+	vmlal.u32	q9,d29,d7[1]
+	vst1.64	{q10},[r7,:128]!
+	vmlal.u32	q11,d28,d0[0]
+	vld1.64	{q10},[r6,:128]
+	vmlal.u32	q12,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+5]
+	vmlal.u32	q13,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q6,d28,d1[1]
+	vmlal.u32	q7,d28,d2[0]
+	vmlal.u32	q8,d28,d2[1]
+	vmlal.u32	q9,d28,d3[0]
+	vmlal.u32	q10,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+6]
+	vmlal.u32	q11,d29,d4[0]
+	vmlal.u32	q12,d29,d4[1]
+	vmlal.u32	q13,d29,d5[0]
+	vmlal.u32	q6,d29,d5[1]
+	vmlal.u32	q7,d29,d6[0]
+	vmlal.u32	q8,d29,d6[1]
+	vmlal.u32	q9,d29,d7[0]
+	vmlal.u32	q10,d29,d7[1]
+	vst1.64	{q11},[r7,:128]!
+	vmlal.u32	q12,d28,d0[0]
+	vld1.64	{q11},[r6,:128]
+	vmlal.u32	q13,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+6]
+	vmlal.u32	q6,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q7,d28,d1[1]
+	vmlal.u32	q8,d28,d2[0]
+	vmlal.u32	q9,d28,d2[1]
+	vmlal.u32	q10,d28,d3[0]
+	vmlal.u32	q11,d28,d3[1]
+	vld1.32	{d28},[r10,:64]!	@ pull smashed b[8*i+7]
+	vmlal.u32	q12,d29,d4[0]
+	vmlal.u32	q13,d29,d4[1]
+	vmlal.u32	q6,d29,d5[0]
+	vmlal.u32	q7,d29,d5[1]
+	vmlal.u32	q8,d29,d6[0]
+	vmlal.u32	q9,d29,d6[1]
+	vmlal.u32	q10,d29,d7[0]
+	vmlal.u32	q11,d29,d7[1]
+	vst1.64	{q12},[r7,:128]!
+	vmlal.u32	q13,d28,d0[0]
+	vld1.64	{q12},[r6,:128]
+	vmlal.u32	q6,d28,d0[1]
+	vld1.32	{d29},[r10,:64]!	@ pull smashed m[8*i+7]
+	vmlal.u32	q7,d28,d1[0]
+	it	ne
+	addne	r6,r6,#16	@ don't advance in last iteration
+	vmlal.u32	q8,d28,d1[1]
+	vmlal.u32	q9,d28,d2[0]
+	vmlal.u32	q10,d28,d2[1]
+	vmlal.u32	q11,d28,d3[0]
+	vmlal.u32	q12,d28,d3[1]
+	it	eq
+	subeq	r1,r1,r5,lsl#2	@ rewind
+	vmlal.u32	q13,d29,d4[0]
+	vld1.32	{d28},[sp,:64]		@ pull smashed b[8*i+0]
+	vmlal.u32	q6,d29,d4[1]
+	vld1.32	{d0,d1,d2,d3},[r1]!
+	vmlal.u32	q7,d29,d5[0]
+	add	r10,sp,#8		@ rewind
+	vmlal.u32	q8,d29,d5[1]
+	vmlal.u32	q9,d29,d6[0]
+	vmlal.u32	q10,d29,d6[1]
+	vmlal.u32	q11,d29,d7[0]
+	vst1.64	{q13},[r7,:128]!
+	vmlal.u32	q12,d29,d7[1]
+
+	bne	LNEON_8n_inner
+	add	r6,sp,#128
+	vst1.64	{q6,q7},[r7,:256]!
+	veor	q2,q2,q2		@ d4-d5
+	vst1.64	{q8,q9},[r7,:256]!
+	veor	q3,q3,q3		@ d6-d7
+	vst1.64	{q10,q11},[r7,:256]!
+	vst1.64	{q12},[r7,:128]
+
+	subs	r9,r9,#8
+	vld1.64	{q6,q7},[r6,:256]!
+	vld1.64	{q8,q9},[r6,:256]!
+	vld1.64	{q10,q11},[r6,:256]!
+	vld1.64	{q12,q13},[r6,:256]!
+
+	itt	ne
+	subne	r3,r3,r5,lsl#2	@ rewind
+	bne	LNEON_8n_outer
+
+	add	r7,sp,#128
+	vst1.64	{q2,q3}, [sp,:256]!	@ start wiping stack frame
+	vshr.u64	d10,d12,#16
+	vst1.64	{q2,q3},[sp,:256]!
+	vadd.u64	d13,d13,d10
+	vst1.64	{q2,q3}, [sp,:256]!
+	vshr.u64	d10,d13,#16
+	vst1.64	{q2,q3}, [sp,:256]!
+	vzip.16	d12,d13
+
+	mov	r8,r5
+	b	LNEON_tail_entry
+
+.align	4
+LNEON_tail:
+	vadd.u64	d12,d12,d10
+	vshr.u64	d10,d12,#16
+	vld1.64	{q8,q9}, [r6, :256]!
+	vadd.u64	d13,d13,d10
+	vld1.64	{q10,q11}, [r6, :256]!
+	vshr.u64	d10,d13,#16
+	vld1.64	{q12,q13}, [r6, :256]!
+	vzip.16	d12,d13
+
+LNEON_tail_entry:
+	vadd.u64	d14,d14,d10
+	vst1.32	{d12[0]}, [r7, :32]!
+	vshr.u64	d10,d14,#16
+	vadd.u64	d15,d15,d10
+	vshr.u64	d10,d15,#16
+	vzip.16	d14,d15
+	vadd.u64	d16,d16,d10
+	vst1.32	{d14[0]}, [r7, :32]!
+	vshr.u64	d10,d16,#16
+	vadd.u64	d17,d17,d10
+	vshr.u64	d10,d17,#16
+	vzip.16	d16,d17
+	vadd.u64	d18,d18,d10
+	vst1.32	{d16[0]}, [r7, :32]!
+	vshr.u64	d10,d18,#16
+	vadd.u64	d19,d19,d10
+	vshr.u64	d10,d19,#16
+	vzip.16	d18,d19
+	vadd.u64	d20,d20,d10
+	vst1.32	{d18[0]}, [r7, :32]!
+	vshr.u64	d10,d20,#16
+	vadd.u64	d21,d21,d10
+	vshr.u64	d10,d21,#16
+	vzip.16	d20,d21
+	vadd.u64	d22,d22,d10
+	vst1.32	{d20[0]}, [r7, :32]!
+	vshr.u64	d10,d22,#16
+	vadd.u64	d23,d23,d10
+	vshr.u64	d10,d23,#16
+	vzip.16	d22,d23
+	vadd.u64	d24,d24,d10
+	vst1.32	{d22[0]}, [r7, :32]!
+	vshr.u64	d10,d24,#16
+	vadd.u64	d25,d25,d10
+	vshr.u64	d10,d25,#16
+	vzip.16	d24,d25
+	vadd.u64	d26,d26,d10
+	vst1.32	{d24[0]}, [r7, :32]!
+	vshr.u64	d10,d26,#16
+	vadd.u64	d27,d27,d10
+	vshr.u64	d10,d27,#16
+	vzip.16	d26,d27
+	vld1.64	{q6,q7}, [r6, :256]!
+	subs	r8,r8,#8
+	vst1.32	{d26[0]},   [r7, :32]!
+	bne	LNEON_tail
+
+	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
+	sub	r3,r3,r5,lsl#2			@ rewind r3
+	subs	r1,sp,#0				@ clear carry flag
+	add	r2,sp,r5,lsl#2
+
+LNEON_sub:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r3!, {r8,r9,r10,r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	LNEON_sub
+
+	ldr	r10, [r1]				@ load top-most bit
+	mov	r11,sp
+	veor	q0,q0,q0
+	sub	r11,r2,r11				@ this is num*4
+	veor	q1,q1,q1
+	mov	r1,sp
+	sub	r0,r0,r11				@ rewind r0
+	mov	r3,r2				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+LNEON_copy_n_zap:
+	ldmia	r1!, {r4,r5,r6,r7}
+	ldmia	r0,  {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	ldmia	r1, {r4,r5,r6,r7}
+	stmia	r0!, {r8,r9,r10,r11}
+	sub	r1,r1,#16
+	ldmia	r0, {r8,r9,r10,r11}
+	it	cc
+	movcc	r8, r4
+	vst1.64	{q0,q1}, [r1,:256]!			@ wipe
+	itt	cc
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0,q1}, [r3,:256]!			@ wipe
+	it	cc
+	movcc	r11,r7
+	teq	r1,r2				@ preserves carry
+	stmia	r0!, {r8,r9,r10,r11}
+	bne	LNEON_copy_n_zap
+
+	mov	sp,ip
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	bx	lr						@ bx lr
+
+#endif
+.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/bsaes-armv7.S b/apple-arm/crypto/fipsmodule/bsaes-armv7.S
new file mode 100644
index 0000000..8329a8c
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -0,0 +1,1536 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is a direct adaptation of the bsaes-x86_64
+@ module for ARM NEON, except that this module is endian-neutral
+@ [in the sense that it can be compiled for either endianness]
+@ courtesy of vld1.8's neutrality. The initial version doesn't
+@ implement an interface to OpenSSL, only low-level primitives and
+@ unsupported entry points, just enough to collect performance
+@ results, which for the Cortex-A8 core are:
+@
+@ encrypt	19.5 cycles per byte processed with 128-bit key
+@ decrypt	22.1 cycles per byte processed with 128-bit key
+@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that the NEON unit
+@ is [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind the key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@						<appro@openssl.org>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+# define VFP_ABI_FRAME	0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME	0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.text
+.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code	32
+# undef __thumb2__
+#endif
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_decrypt8
+#endif
+.align	4
+_bsaes_decrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,LM0ISR
+#else
+	add	r6,r6,#LM0ISR-_bsaes_decrypt8
+#endif
+
+	vldmia	r6!, {q8}		@ LM0ISR
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
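+	@ Bitslice transpose: the eight blocks in q0-q7 are rearranged so
+	@ that, in effect, register qn collects bit n of every byte. Three
+	@ rounds of masked swaps (masks 0x55/0x33/0x0f at shift distances
+	@ 1/2/4) perform the 8x8 bit-matrix transpose, after which each
+	@ S-box evaluation becomes pure boolean logic across q0-q7.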
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	Ldec_sbox
+.align	4
+Ldec_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+Ldec_sbox:
+	veor	q1, q1, q4
+	veor	q3, q3, q4
+
+	veor	q4, q4, q7
+	veor	q1, q1, q6
+	veor	q2, q2, q7
+	veor	q6, q6, q4
+
+	veor	q0, q0, q1
+	veor	q2, q2, q5
+	veor	q7, q7, q6
+	veor	q3, q3, q0
+	veor	q5, q5, q0
+	veor	q1, q1, q3
+	veor	q11, q3, q0
+	veor	q10, q7, q4
+	veor	q9, q1, q6
+	veor	q13, q4, q0
+	vmov	q8, q10
+	veor	q12, q5, q2
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q6, q2
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q3, q7
+	veor	q12, q1, q5
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q4, q6
+	veor	q9, q9, q14
+	vand	q13, q0, q2
+	vand	q14, q7, q1
+	vorr	q15, q3, q5
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q5, q2
+	veor	q8, q1, q6
+	veor	q10, q15, q14
+	vand	q10, q10, q5
+	veor	q5, q5, q1
+	vand	q11, q1, q15
+	vand	q5, q5, q14
+	veor	q1, q11, q10
+	veor	q5, q5, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q2
+	veor	q12, q12, q8
+	veor	q2, q2, q6
+	vand	q8, q8, q15
+	vand	q6, q6, q13
+	vand	q12, q12, q14
+	vand	q2, q2, q9
+	veor	q8, q8, q12
+	veor	q2, q2, q6
+	veor	q12, q12, q11
+	veor	q6, q6, q10
+	veor	q5, q5, q12
+	veor	q2, q2, q12
+	veor	q1, q1, q8
+	veor	q6, q6, q8
+
+	veor	q12, q3, q0
+	veor	q8, q7, q4
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q4
+	vand	q8, q8, q15
+	vand	q4, q4, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q4
+	veor	q12, q12, q11
+	veor	q4, q4, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q3
+	veor	q3, q3, q7
+	vand	q11, q7, q15
+	vand	q3, q3, q14
+	veor	q7, q11, q10
+	veor	q3, q3, q11
+	veor	q3, q3, q12
+	veor	q0, q0, q12
+	veor	q7, q7, q8
+	veor	q4, q4, q8
+	veor	q1, q1, q7
+	veor	q6, q6, q5
+
+	veor	q4, q4, q1
+	veor	q2, q2, q7
+	veor	q5, q5, q7
+	veor	q4, q4, q2
+	veor	q7, q7, q0
+	veor	q4, q4, q5
+	veor	q3, q3, q6
+	veor	q6, q6, q1
+	veor	q3, q3, q4
+
+	veor	q4, q4, q0
+	veor	q7, q7, q3
+	subs	r5,r5,#1
+	bcc	Ldec_done
+	@ multiplication by 0x05-0x00-0x04-0x00
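+	@ (InvMixColumns factors as MixColumns composed with a per-column
+	@ multiply by {04}x^2 + {05} -- coefficients 05-00-04-00 -- since
+	@ [0e 0b 0d 09] = [02 03 01 01] * [05 00 04 00] over GF(2^8);
+	@ this block applies that factor so plain MixColumns, below,
+	@ completes the inverse transform.)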
+	vext.8	q8, q0, q0, #8
+	vext.8	q14, q3, q3, #8
+	vext.8	q15, q5, q5, #8
+	veor	q8, q8, q0
+	vext.8	q9, q1, q1, #8
+	veor	q14, q14, q3
+	vext.8	q10, q6, q6, #8
+	veor	q15, q15, q5
+	vext.8	q11, q4, q4, #8
+	veor	q9, q9, q1
+	vext.8	q12, q2, q2, #8
+	veor	q10, q10, q6
+	vext.8	q13, q7, q7, #8
+	veor	q11, q11, q4
+	veor	q12, q12, q2
+	veor	q13, q13, q7
+
+	veor	q0, q0, q14
+	veor	q1, q1, q14
+	veor	q6, q6, q8
+	veor	q2, q2, q10
+	veor	q4, q4, q9
+	veor	q1, q1, q15
+	veor	q6, q6, q15
+	veor	q2, q2, q14
+	veor	q7, q7, q11
+	veor	q4, q4, q14
+	veor	q3, q3, q12
+	veor	q2, q2, q15
+	veor	q7, q7, q15
+	veor	q5, q5, q13
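+	@ MixColumns on the bit-sliced state: multiplying by x only moves
+	@ bits between planes, so the whole transform becomes XORs plus
+	@ 32- and 64-bit byte rotations (vext) of each plane, built from
+	@ the x ^ (x <<< 32) terms computed next.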
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q6, q6, #12
+	veor	q1, q1, q9
+	vext.8	q11, q4, q4, #12
+	veor	q6, q6, q10
+	vext.8	q12, q2, q2, #12
+	veor	q4, q4, q11
+	vext.8	q13, q7, q7, #12
+	veor	q2, q2, q12
+	vext.8	q14, q3, q3, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q3, q3, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q2
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q2, q2, #8
+	veor	q12, q12, q4
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q3
+	vext.8	q2, q4, q4, #8
+	veor	q11, q11, q6
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q4, q3, q3, #8
+	veor	q11, q11, q5
+	vext.8	q3, q6, q6, #8
+	veor	q5, q9, q13
+	veor	q11, q11, q2
+	veor	q7, q7, q15
+	veor	q6, q4, q14
+	veor	q4, q8, q12
+	veor	q2, q3, q10
+	vmov	q3, q11
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ LISR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	Ldec_loop
+	vldmia	r6, {q12}		@ LISRM0
+	b	Ldec_loop
+.align	4
+Ldec_done:
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q3, #1
+	vshr.u64	q11, q2, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q4
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q2, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q4
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q4, q4, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q4, #4
+	vshr.u64	q11, q6, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q4, q4, q10
+	veor	q6, q6, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q6, q6, q8
+	veor	q4, q4, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q8
+	veor	q3, q3, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+
+
+
+.align	6
+_bsaes_const:
+LM0ISR:@ InvShiftRows constants
+.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+LISR:
+.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+LISRM0:
+.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+LM0SR:@ ShiftRows constants
+.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+LSR:
+.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+LSRM0:
+.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+LM0:
+.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+LREVM0SR:
+.quad	0x090d01050c000408, 0x03070b0f060a0e02
+.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	6
+
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_encrypt8
+#endif
+.align	4
+_bsaes_encrypt8:
+	adr	r6,.
+	vldmia	r4!, {q9}		@ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,LM0SR
+#else
+	sub	r6,r6,#_bsaes_encrypt8-LM0SR
+#endif
+
+	vldmia	r6!, {q8}		@ LM0SR
+_bsaes_encrypt8_alt:
+	veor	q10, q0, q9	@ xor with round0 key
+	veor	q11, q1, q9
+	vtbl.8	d0, {q10}, d16
+	vtbl.8	d1, {q10}, d17
+	veor	q12, q2, q9
+	vtbl.8	d2, {q11}, d16
+	vtbl.8	d3, {q11}, d17
+	veor	q13, q3, q9
+	vtbl.8	d4, {q12}, d16
+	vtbl.8	d5, {q12}, d17
+	veor	q14, q4, q9
+	vtbl.8	d6, {q13}, d16
+	vtbl.8	d7, {q13}, d17
+	veor	q15, q5, q9
+	vtbl.8	d8, {q14}, d16
+	vtbl.8	d9, {q14}, d17
+	veor	q10, q6, q9
+	vtbl.8	d10, {q15}, d16
+	vtbl.8	d11, {q15}, d17
+	veor	q11, q7, q9
+	vtbl.8	d12, {q10}, d16
+	vtbl.8	d13, {q10}, d17
+	vtbl.8	d14, {q11}, d16
+	vtbl.8	d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
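+	@ Transpose eight states into bit-sliced form with SWAPMOVE steps:
+	@   t = ((a >> n) ^ b) & mask;  b ^= t;  a ^= t << n
+	@ exchanges the bits selected by mask between a and b; the masks
+	@ 0x55/0x33/0x0f below pair with shifts n = 1/2/4.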
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q6, #1
+	vshr.u64	q11, q4, #1
+	veor	q10, q10, q7
+	veor	q11, q11, q5
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #1
+	veor	q5, q5, q11
+	vshl.u64	q11, q11, #1
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q3
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q5, #2
+	vshr.u64	q11, q4, #2
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #2
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #2
+	veor	q5, q5, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q3
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q3, q3, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q3, #4
+	vshr.u64	q11, q2, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q6
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q6, q6, q11
+	vshl.u64	q11, q11, #4
+	veor	q3, q3, q10
+	veor	q2, q2, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q4
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	sub	r5,r5,#1
+	b	Lenc_sbox
+.align	4
+Lenc_loop:
+	vldmia	r4!, {q8,q9,q10,q11}
+	veor	q8, q8, q0
+	veor	q9, q9, q1
+	vtbl.8	d0, {q8}, d24
+	vtbl.8	d1, {q8}, d25
+	vldmia	r4!, {q8}
+	veor	q10, q10, q2
+	vtbl.8	d2, {q9}, d24
+	vtbl.8	d3, {q9}, d25
+	vldmia	r4!, {q9}
+	veor	q11, q11, q3
+	vtbl.8	d4, {q10}, d24
+	vtbl.8	d5, {q10}, d25
+	vldmia	r4!, {q10}
+	vtbl.8	d6, {q11}, d24
+	vtbl.8	d7, {q11}, d25
+	vldmia	r4!, {q11}
+	veor	q8, q8, q4
+	veor	q9, q9, q5
+	vtbl.8	d8, {q8}, d24
+	vtbl.8	d9, {q8}, d25
+	veor	q10, q10, q6
+	vtbl.8	d10, {q9}, d24
+	vtbl.8	d11, {q9}, d25
+	veor	q11, q11, q7
+	vtbl.8	d12, {q10}, d24
+	vtbl.8	d13, {q10}, d25
+	vtbl.8	d14, {q11}, d24
+	vtbl.8	d15, {q11}, d25
+Lenc_sbox:
+	veor	q2, q2, q1
+	veor	q5, q5, q6
+	veor	q3, q3, q0
+	veor	q6, q6, q2
+	veor	q5, q5, q0
+
+	veor	q6, q6, q3
+	veor	q3, q3, q7
+	veor	q7, q7, q5
+	veor	q3, q3, q4
+	veor	q4, q4, q5
+
+	veor	q2, q2, q7
+	veor	q3, q3, q1
+	veor	q1, q1, q5
+	veor	q11, q7, q4
+	veor	q10, q1, q2
+	veor	q9, q5, q3
+	veor	q13, q2, q4
+	vmov	q8, q10
+	veor	q12, q6, q0
+
+	vorr	q10, q10, q9
+	veor	q15, q11, q8
+	vand	q14, q11, q12
+	vorr	q11, q11, q12
+	veor	q12, q12, q9
+	vand	q8, q8, q9
+	veor	q9, q3, q0
+	vand	q15, q15, q12
+	vand	q13, q13, q9
+	veor	q9, q7, q1
+	veor	q12, q5, q6
+	veor	q11, q11, q13
+	veor	q10, q10, q13
+	vand	q13, q9, q12
+	vorr	q9, q9, q12
+	veor	q11, q11, q15
+	veor	q8, q8, q13
+	veor	q10, q10, q14
+	veor	q9, q9, q15
+	veor	q8, q8, q14
+	vand	q12, q2, q3
+	veor	q9, q9, q14
+	vand	q13, q4, q0
+	vand	q14, q1, q5
+	vorr	q15, q7, q6
+	veor	q11, q11, q12
+	veor	q9, q9, q14
+	veor	q8, q8, q15
+	veor	q10, q10, q13
+
+	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3
+
+	@ new smaller inversion
+
+	vand	q14, q11, q9
+	vmov	q12, q8
+
+	veor	q13, q10, q14
+	veor	q15, q8, q14
+	veor	q14, q8, q14	@ q14=q15
+
+	vbsl	q13, q9, q8
+	vbsl	q15, q11, q10
+	veor	q11, q11, q10
+
+	vbsl	q12, q13, q14
+	vbsl	q8, q14, q13
+
+	vand	q14, q12, q15
+	veor	q9, q9, q8
+
+	veor	q14, q14, q11
+	veor	q12, q6, q0
+	veor	q8, q5, q3
+	veor	q10, q15, q14
+	vand	q10, q10, q6
+	veor	q6, q6, q5
+	vand	q11, q5, q15
+	vand	q6, q6, q14
+	veor	q5, q11, q10
+	veor	q6, q6, q11
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q0
+	veor	q12, q12, q8
+	veor	q0, q0, q3
+	vand	q8, q8, q15
+	vand	q3, q3, q13
+	vand	q12, q12, q14
+	vand	q0, q0, q9
+	veor	q8, q8, q12
+	veor	q0, q0, q3
+	veor	q12, q12, q11
+	veor	q3, q3, q10
+	veor	q6, q6, q12
+	veor	q0, q0, q12
+	veor	q5, q5, q8
+	veor	q3, q3, q8
+
+	veor	q12, q7, q4
+	veor	q8, q1, q2
+	veor	q11, q15, q14
+	veor	q10, q13, q9
+	vand	q11, q11, q12
+	vand	q10, q10, q4
+	veor	q12, q12, q8
+	veor	q4, q4, q2
+	vand	q8, q8, q15
+	vand	q2, q2, q13
+	vand	q12, q12, q14
+	vand	q4, q4, q9
+	veor	q8, q8, q12
+	veor	q4, q4, q2
+	veor	q12, q12, q11
+	veor	q2, q2, q10
+	veor	q15, q15, q13
+	veor	q14, q14, q9
+	veor	q10, q15, q14
+	vand	q10, q10, q7
+	veor	q7, q7, q1
+	vand	q11, q1, q15
+	vand	q7, q7, q14
+	veor	q1, q11, q10
+	veor	q7, q7, q11
+	veor	q7, q7, q12
+	veor	q4, q4, q12
+	veor	q1, q1, q8
+	veor	q2, q2, q8
+	veor	q7, q7, q0
+	veor	q1, q1, q6
+	veor	q6, q6, q0
+	veor	q4, q4, q7
+	veor	q0, q0, q1
+
+	veor	q1, q1, q5
+	veor	q5, q5, q2
+	veor	q2, q2, q3
+	veor	q3, q3, q5
+	veor	q4, q4, q5
+
+	veor	q6, q6, q3
+	subs	r5,r5,#1
+	bcc	Lenc_done
+	vext.8	q8, q0, q0, #12	@ x0 <<< 32
+	vext.8	q9, q1, q1, #12
+	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
+	vext.8	q10, q4, q4, #12
+	veor	q1, q1, q9
+	vext.8	q11, q6, q6, #12
+	veor	q4, q4, q10
+	vext.8	q12, q3, q3, #12
+	veor	q6, q6, q11
+	vext.8	q13, q7, q7, #12
+	veor	q3, q3, q12
+	vext.8	q14, q2, q2, #12
+	veor	q7, q7, q13
+	vext.8	q15, q5, q5, #12
+	veor	q2, q2, q14
+
+	veor	q9, q9, q0
+	veor	q5, q5, q15
+	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64)
+	veor	q10, q10, q1
+	veor	q8, q8, q5
+	veor	q9, q9, q5
+	vext.8	q1, q1, q1, #8
+	veor	q13, q13, q3
+	veor	q0, q0, q8
+	veor	q14, q14, q7
+	veor	q1, q1, q9
+	vext.8	q8, q3, q3, #8
+	veor	q12, q12, q6
+	vext.8	q9, q7, q7, #8
+	veor	q15, q15, q2
+	vext.8	q3, q6, q6, #8
+	veor	q11, q11, q4
+	vext.8	q7, q5, q5, #8
+	veor	q12, q12, q5
+	vext.8	q6, q2, q2, #8
+	veor	q11, q11, q5
+	vext.8	q2, q4, q4, #8
+	veor	q5, q9, q13
+	veor	q4, q8, q12
+	veor	q3, q3, q11
+	veor	q7, q7, q15
+	veor	q6, q6, q14
+	 @ vmov	q4, q8
+	veor	q2, q2, q10
+	 @ vmov	q5, q9
+	vldmia	r6, {q12}		@ LSR
+	ite	eq				@ Thumb2 thing, sanity check in ARM
+	addeq	r6,r6,#0x10
+	bne	Lenc_loop
+	vldmia	r6, {q12}		@ LSRM0
+	b	Lenc_loop
+.align	4
+Lenc_done:
+	vmov.i8	q8,#0x55			@ compose LBS0
+	vmov.i8	q9,#0x33			@ compose LBS1
+	vshr.u64	q10, q2, #1
+	vshr.u64	q11, q3, #1
+	veor	q10, q10, q5
+	veor	q11, q11, q7
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #1
+	veor	q7, q7, q11
+	vshl.u64	q11, q11, #1
+	veor	q2, q2, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q4, #1
+	vshr.u64	q11, q0, #1
+	veor	q10, q10, q6
+	veor	q11, q11, q1
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #1
+	veor	q1, q1, q11
+	vshl.u64	q11, q11, #1
+	veor	q4, q4, q10
+	veor	q0, q0, q11
+	vmov.i8	q8,#0x0f			@ compose LBS2
+	vshr.u64	q10, q7, #2
+	vshr.u64	q11, q3, #2
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #2
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #2
+	veor	q7, q7, q10
+	veor	q3, q3, q11
+	vshr.u64	q10, q1, #2
+	vshr.u64	q11, q0, #2
+	veor	q10, q10, q6
+	veor	q11, q11, q4
+	vand	q10, q10, q9
+	vand	q11, q11, q9
+	veor	q6, q6, q10
+	vshl.u64	q10, q10, #2
+	veor	q4, q4, q11
+	vshl.u64	q11, q11, #2
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vshr.u64	q10, q6, #4
+	vshr.u64	q11, q4, #4
+	veor	q10, q10, q5
+	veor	q11, q11, q2
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q5, q5, q10
+	vshl.u64	q10, q10, #4
+	veor	q2, q2, q11
+	vshl.u64	q11, q11, #4
+	veor	q6, q6, q10
+	veor	q4, q4, q11
+	vshr.u64	q10, q1, #4
+	vshr.u64	q11, q0, #4
+	veor	q10, q10, q7
+	veor	q11, q11, q3
+	vand	q10, q10, q8
+	vand	q11, q11, q8
+	veor	q7, q7, q10
+	vshl.u64	q10, q10, #4
+	veor	q3, q3, q11
+	vshl.u64	q11, q11, #4
+	veor	q1, q1, q10
+	veor	q0, q0, q11
+	vldmia	r4, {q8}			@ last round key
+	veor	q4, q4, q8
+	veor	q6, q6, q8
+	veor	q3, q3, q8
+	veor	q7, q7, q8
+	veor	q2, q2, q8
+	veor	q5, q5, q8
+	veor	q0, q0, q8
+	veor	q1, q1, q8
+	bx	lr
+
+#ifdef __thumb2__
+.thumb_func	_bsaes_key_convert
+#endif
+.align	4
+_bsaes_key_convert:
+	adr	r6,.
+	vld1.8	{q7},  [r4]!		@ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+	adr	r6,LM0
+#else
+	sub	r6,r6,#_bsaes_key_convert-LM0
+#endif
+	vld1.8	{q15}, [r4]!		@ load round 1 key
+
+	vmov.i8	q8,  #0x01			@ bit masks
+	vmov.i8	q9,  #0x02
+	vmov.i8	q10, #0x04
+	vmov.i8	q11, #0x08
+	vmov.i8	q12, #0x10
+	vmov.i8	q13, #0x20
+	vldmia	r6, {q14}		@ LM0
+
+#ifdef __ARMEL__
+	vrev32.8	q7,  q7
+	vrev32.8	q15, q15
+#endif
+	sub	r5,r5,#1
+	vstmia	r12!, {q7}		@ save round 0 key
+	b	Lkey_loop
+
+.align	4
+Lkey_loop:
+	vtbl.8	d14,{q15},d28
+	vtbl.8	d15,{q15},d29
+	vmov.i8	q6,  #0x40
+	vmov.i8	q15, #0x80
+
+	vtst.8	q0, q7, q8
+	vtst.8	q1, q7, q9
+	vtst.8	q2, q7, q10
+	vtst.8	q3, q7, q11
+	vtst.8	q4, q7, q12
+	vtst.8	q5, q7, q13
+	vtst.8	q6, q7, q6
+	vtst.8	q7, q7, q15
+	vld1.8	{q15}, [r4]!		@ load next round key
+	vmvn	q0, q0		@ "pnot"
+	vmvn	q1, q1
+	vmvn	q5, q5
+	vmvn	q6, q6
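+	@ Planes 0, 1, 5 and 6 are complemented: 0x63 = 0b01100011, so
+	@ this XORs the S-box affine constant 0x63 into every key byte,
+	@ letting the bit-sliced S-box omit it (hence "pnot" above).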
+#ifdef __ARMEL__
+	vrev32.8	q15, q15
+#endif
+	subs	r5,r5,#1
+	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
+	bne	Lkey_loop
+
+	vmov.i8	q7,#0x63			@ compose L63
+	@ don't save last round key
+	bx	lr
+
+.globl	_bsaes_cbc_encrypt
+.private_extern	_bsaes_cbc_encrypt
+#ifdef __thumb2__
+.thumb_func	_bsaes_cbc_encrypt
+#endif
+.align	5
+_bsaes_cbc_encrypt:
+	@ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+	@ short inputs. We patch this out, using bsaes for all input sizes.
+
+	@ it is up to the caller to make sure we are called with enc == 0
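+	@ CBC decryption computes P[i] = Decrypt(C[i]) ^ C[i-1] with
+	@ C[-1] = IV; the main loop below does eight blocks per iteration,
+	@ reloading the ciphertext from r0 for the XOR step.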
+
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ IV is 1st arg on the stack
+	mov	r2, r2, lsr#4		@ len in 16 byte blocks
+	sub	sp, #0x10			@ scratch space to carry over the IV
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	vldmia	sp, {q6}
+	vstmia	r12,  {q15}		@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	sp, {q7}
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	add	r4, r3, #248
+	vldmia	r4, {q6}
+	vstmia	r12, {q15}			@ save last round key
+	veor	q7, q7, q6	@ fix up round 0 key
+	vstmia	r4, {q7}
+
+.align	2
+
+#endif
+
+	vld1.8	{q15}, [r8]		@ load IV
+	b	Lcbc_dec_loop
+
+.align	4
+Lcbc_dec_loop:
+	subs	r2, r2, #0x8
+	bmi	Lcbc_dec_loop_finish
+
+	vld1.8	{q0,q1}, [r0]!	@ load input
+	vld1.8	{q2,q3}, [r0]!
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	vld1.8	{q4,q5}, [r0]!
+	mov	r5, r10
+	vld1.8	{q6,q7}, [r0]
+	sub	r0, r0, #0x60
+	vstmia	r9, {q15}			@ put aside IV
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	veor	q5, q5, q14
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	vst1.8	{q5}, [r1]!
+
+	b	Lcbc_dec_loop
+
+Lcbc_dec_loop_finish:
+	adds	r2, r2, #8
+	beq	Lcbc_dec_done
+
+	@ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	mov	r4, sp			@ pass the key
+#else
+	add	r4, r3, #248
+#endif
+	mov	r5, r10
+	vstmia	r9, {q15}			@ put aside IV
+
+	vld1.8	{q0}, [r0]!		@ load input
+	cmp	r2, #2
+	blo	Lcbc_dec_one
+	vld1.8	{q1}, [r0]!
+	beq	Lcbc_dec_two
+	vld1.8	{q2}, [r0]!
+	cmp	r2, #4
+	blo	Lcbc_dec_three
+	vld1.8	{q3}, [r0]!
+	beq	Lcbc_dec_four
+	vld1.8	{q4}, [r0]!
+	cmp	r2, #6
+	blo	Lcbc_dec_five
+	vld1.8	{q5}, [r0]!
+	beq	Lcbc_dec_six
+	vld1.8	{q6}, [r0]!
+	sub	r0, r0, #0x70
+
+	bl	_bsaes_decrypt8
+
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q3, q3, q13
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	vst1.8	{q3}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_six:
+	sub	r0, r0, #0x60
+	bl	_bsaes_decrypt8
+	vldmia	r9,{q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q12}, [r0]!
+	veor	q4, q4, q10
+	veor	q2, q2, q11
+	vld1.8	{q15}, [r0]!
+	veor	q7, q7, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	vst1.8	{q7}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_five:
+	sub	r0, r0, #0x50
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10,q11}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q2, q2, q11
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	vst1.8	{q2}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_four:
+	sub	r0, r0, #0x40
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q10}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vld1.8	{q15}, [r0]!
+	veor	q4, q4, q10
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	vst1.8	{q4}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_three:
+	sub	r0, r0, #0x30
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8,q9}, [r0]!	@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!
+	veor	q1, q1, q8
+	veor	q6, q6, q9
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	vst1.8	{q6}, [r1]!
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_two:
+	sub	r0, r0, #0x20
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q8}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q1, q1, q8
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	b	Lcbc_dec_done
+.align	4
+Lcbc_dec_one:
+	sub	r0, r0, #0x10
+	bl	_bsaes_decrypt8
+	vldmia	r9, {q14}			@ reload IV
+	vld1.8	{q15}, [r0]!		@ reload input
+	veor	q0, q0, q14	@ ^= IV
+	vst1.8	{q0}, [r1]!		@ write output
+
+Lcbc_dec_done:
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+Lcbc_dec_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	Lcbc_dec_bzero
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
+	vst1.8	{q15}, [r8]		@ return IV
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+
+.globl	_bsaes_ctr32_encrypt_blocks
+.private_extern	_bsaes_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func	_bsaes_ctr32_encrypt_blocks
+#endif
+.align	5
+_bsaes_ctr32_encrypt_blocks:
+	@ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+	@ out to retain a constant-time implementation.
+	mov	ip, sp
+	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+	VFP_ABI_PUSH
+	ldr	r8, [ip]			@ ctr is 1st arg on the stack
+	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
+	mov	r9, sp				@ save sp
+
+	ldr	r10, [r3, #240]		@ get # of rounds
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	@ allocate the key schedule on the stack
+	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
+	add	r12, #96			@ size of bit-sliced key schedule
+
+	@ populate the key schedule
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	mov	sp, r12				@ sp is sp
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+	vld1.8	{q0}, [r8]		@ load counter
+#ifdef	__APPLE__
+	mov	r8, #:lower16:(LREVM0SR-LM0)
+	add	r8, r6, r8
+#else
+	add	r8, r6, #LREVM0SR-LM0	@ borrow r8
+#endif
+	vldmia	sp, {q4}		@ load round0 key
+#else
+	ldr	r12, [r3, #244]
+	eors	r12, #1
+	beq	0f
+
+	@ populate the key schedule
+	str	r12, [r3, #244]
+	mov	r4, r3			@ pass key
+	mov	r5, r10			@ pass # of rounds
+	add	r12, r3, #248			@ pass key schedule
+	bl	_bsaes_key_convert
+	veor	q7,q7,q15	@ fix up last round key
+	vstmia	r12, {q7}			@ save last round key
+
+.align	2
+	add	r12, r3, #248
+	vld1.8	{q0}, [r8]		@ load counter
+	adrl	r8, LREVM0SR			@ borrow r8
+	vldmia	r12, {q4}			@ load round0 key
+	sub	sp, #0x10			@ place for adjusted round0 key
+#endif
+
+	vmov.i32	q8,#1		@ compose 1<<96
+	veor	q9,q9,q9
+	vrev32.8	q0,q0
+	vext.8	q8,q9,q8,#4
+	vrev32.8	q4,q4
+	vadd.u32	q9,q8,q8	@ compose 2<<96
+	vstmia	sp, {q4}		@ save adjusted round0 key
+	b	Lctr_enc_loop
+
+.align	4
+Lctr_enc_loop:
+	vadd.u32	q10, q8, q9	@ compose 3<<96
+	vadd.u32	q1, q0, q8	@ +1
+	vadd.u32	q2, q0, q9	@ +2
+	vadd.u32	q3, q0, q10	@ +3
+	vadd.u32	q4, q1, q10
+	vadd.u32	q5, q2, q10
+	vadd.u32	q6, q3, q10
+	vadd.u32	q7, q4, q10
+	vadd.u32	q10, q5, q10	@ next counter
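+	@ q8 = 1<<96 places a 1 in the (byte-reversed) big-endian counter
+	@ word, so the plain vadd.u32 above yields q0..q7 = ctr+0 .. ctr+7
+	@ and q10 = ctr+8 for the next iteration.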
+
+	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	@ to flip byte order in 32-bit counter
+
+	vldmia	sp, {q9}		@ load round0 key
+#ifndef	BSAES_ASM_EXTENDED_KEY
+	add	r4, sp, #0x10		@ pass next round key
+#else
+	add	r4, r3, #264
+#endif
+	vldmia	r8, {q8}			@ LREVM0SR
+	mov	r5, r10			@ pass rounds
+	vstmia	r9, {q10}			@ save next counter
+#ifdef	__APPLE__
+	mov	r6, #:lower16:(LREVM0SR-LSR)
+	sub	r6, r8, r6
+#else
+	sub	r6, r8, #LREVM0SR-LSR	@ pass constants
+#endif
+
+	bl	_bsaes_encrypt8_alt
+
+	subs	r2, r2, #8
+	blo	Lctr_enc_loop_done
+
+	vld1.8	{q8,q9}, [r0]!	@ load input
+	vld1.8	{q10,q11}, [r0]!
+	veor	q0, q8
+	veor	q1, q9
+	vld1.8	{q12,q13}, [r0]!
+	veor	q4, q10
+	veor	q6, q11
+	vld1.8	{q14,q15}, [r0]!
+	veor	q3, q12
+	vst1.8	{q0,q1}, [r1]!	@ write output
+	veor	q7, q13
+	veor	q2, q14
+	vst1.8	{q4}, [r1]!
+	veor	q5, q15
+	vst1.8	{q6}, [r1]!
+	vmov.i32	q8, #1			@ compose 1<<96
+	vst1.8	{q3}, [r1]!
+	veor	q9, q9, q9
+	vst1.8	{q7}, [r1]!
+	vext.8	q8, q9, q8, #4
+	vst1.8	{q2}, [r1]!
+	vadd.u32	q9,q8,q8		@ compose 2<<96
+	vst1.8	{q5}, [r1]!
+	vldmia	r9, {q0}			@ load counter
+
+	bne	Lctr_enc_loop
+	b	Lctr_enc_done
+
+.align	4
+Lctr_enc_loop_done:
+	add	r2, r2, #8
+	vld1.8	{q8}, [r0]!	@ load input
+	veor	q0, q8
+	vst1.8	{q0}, [r1]!	@ write output
+	cmp	r2, #2
+	blo	Lctr_enc_done
+	vld1.8	{q9}, [r0]!
+	veor	q1, q9
+	vst1.8	{q1}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q10}, [r0]!
+	veor	q4, q10
+	vst1.8	{q4}, [r1]!
+	cmp	r2, #4
+	blo	Lctr_enc_done
+	vld1.8	{q11}, [r0]!
+	veor	q6, q11
+	vst1.8	{q6}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q12}, [r0]!
+	veor	q3, q12
+	vst1.8	{q3}, [r1]!
+	cmp	r2, #6
+	blo	Lctr_enc_done
+	vld1.8	{q13}, [r0]!
+	veor	q7, q13
+	vst1.8	{q7}, [r1]!
+	beq	Lctr_enc_done
+	vld1.8	{q14}, [r0]
+	veor	q2, q14
+	vst1.8	{q2}, [r1]!
+
+Lctr_enc_done:
+	vmov.i32	q0, #0
+	vmov.i32	q1, #0
+#ifndef	BSAES_ASM_EXTENDED_KEY
+Lctr_enc_bzero:@ wipe key schedule [if any]
+	vstmia	sp!, {q0,q1}
+	cmp	sp, r9
+	bne	Lctr_enc_bzero
+#else
+	vstmia	sp, {q0,q1}
+#endif
+
+	mov	sp, r9
+	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
+	VFP_ABI_POP
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
+
+	@ OpenSSL contains aes_nohw_* fallback code here. We patch this
+	@ out to retain a constant-time implementation.
+
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/ghash-armv4.S b/apple-arm/crypto/fipsmodule/ghash-armv4.S
new file mode 100644
index 0000000..36f4cce
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/ghash-armv4.S
@@ -0,0 +1,258 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in aesv8-armx.pl.)
+
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax	unified
+#define ldrplb  ldrbpl
+#define ldrneb  ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl	_gcm_init_neon
+.private_extern	_gcm_init_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_init_neon
+#endif
+.align	4
+_gcm_init_neon:
+	vld1.64	d7,[r1]!		@ load H
+	vmov.i8	q8,#0xe1
+	vld1.64	d6,[r1]
+	vshl.i64	d17,#57
+	vshr.u64	d16,#63		@ t0=0xc2....01
+	vdup.8	q9,d7[7]
+	vshr.u64	d26,d6,#63
+	vshr.s8	q9,#7			@ broadcast carry bit
+	vshl.i64	q3,q3,#1
+	vand	q8,q8,q9
+	vorr	d7,d26		@ H<<<=1
+	veor	q3,q3,q8		@ twisted H
+	vstmia	r0,{q3}
+
+	bx	lr					@ bx lr
+
+
+.globl	_gcm_gmult_neon
+.private_extern	_gcm_gmult_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_gmult_neon
+#endif
+.align	4
+_gcm_gmult_neon:
+	vld1.64	d7,[r0]!		@ load Xi
+	vld1.64	d6,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+	mov	r3,#16
+	b	Lgmult_neon
+
+
+.globl	_gcm_ghash_neon
+.private_extern	_gcm_ghash_neon
+#ifdef __thumb2__
+.thumb_func	_gcm_ghash_neon
+#endif
+.align	4
+_gcm_ghash_neon:
+	vld1.64	d1,[r0]!		@ load Xi
+	vld1.64	d0,[r0]!
+	vmov.i64	d29,#0x0000ffffffffffff
+	vldmia	r1,{d26,d27}	@ load twisted H
+	vmov.i64	d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	vmov.i64	d31,#0x000000000000ffff
+	veor	d28,d26,d27		@ Karatsuba pre-processing
+
+Loop_neon:
+	vld1.64	d7,[r2]!		@ load inp
+	vld1.64	d6,[r2]!
+#ifdef __ARMEL__
+	vrev64.8	q3,q3
+#endif
+	veor	q3,q0			@ inp^=Xi
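+	@ What follows is a 64x64-bit carry-less multiply synthesized from
+	@ 8-bit vmull.p8 products: byte-rotated copies of the operands
+	@ (A1..A4, B1..B4) generate partial products that are masked and
+	@ shifted into place (cf. Camara-Gouvea-Lopez-Dahab's NEON
+	@ polynomial multiplication technique).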
+Lgmult_neon:
+	vext.8	d16, d26, d26, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d0, d6, d6, #1	@ B1
+	vmull.p8	q0, d26, d0		@ E = A*B1
+	vext.8	d18, d26, d26, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d26, d22		@ G = A*B2
+	vext.8	d20, d26, d26, #3	@ A3
+	veor	q8, q8, q0		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d0, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q0, d26, d0		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d26, d22		@ K = A*B4
+	veor	q10, q10, q0		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q0, d26, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q0, q0, q8
+	veor	q0, q0, q10
+	veor	d6,d6,d7	@ Karatsuba pre-processing
+	vext.8	d16, d28, d28, #1	@ A1
+	vmull.p8	q8, d16, d6		@ F = A1*B
+	vext.8	d2, d6, d6, #1	@ B1
+	vmull.p8	q1, d28, d2		@ E = A*B1
+	vext.8	d18, d28, d28, #2	@ A2
+	vmull.p8	q9, d18, d6		@ H = A2*B
+	vext.8	d22, d6, d6, #2	@ B2
+	vmull.p8	q11, d28, d22		@ G = A*B2
+	vext.8	d20, d28, d28, #3	@ A3
+	veor	q8, q8, q1		@ L = E + F
+	vmull.p8	q10, d20, d6		@ J = A3*B
+	vext.8	d2, d6, d6, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q1, d28, d2		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d6, d6, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d28, d22		@ K = A*B4
+	veor	q10, q10, q1		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q1, d28, d6		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q1, q1, q8
+	veor	q1, q1, q10
+	vext.8	d16, d27, d27, #1	@ A1
+	vmull.p8	q8, d16, d7		@ F = A1*B
+	vext.8	d4, d7, d7, #1	@ B1
+	vmull.p8	q2, d27, d4		@ E = A*B1
+	vext.8	d18, d27, d27, #2	@ A2
+	vmull.p8	q9, d18, d7		@ H = A2*B
+	vext.8	d22, d7, d7, #2	@ B2
+	vmull.p8	q11, d27, d22		@ G = A*B2
+	vext.8	d20, d27, d27, #3	@ A3
+	veor	q8, q8, q2		@ L = E + F
+	vmull.p8	q10, d20, d7		@ J = A3*B
+	vext.8	d4, d7, d7, #3	@ B3
+	veor	q9, q9, q11		@ M = G + H
+	vmull.p8	q2, d27, d4		@ I = A*B3
+	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
+	vand	d17, d17, d29
+	vext.8	d22, d7, d7, #4	@ B4
+	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
+	vand	d19, d19, d30
+	vmull.p8	q11, d27, d22		@ K = A*B4
+	veor	q10, q10, q2		@ N = I + J
+	veor	d16, d16, d17
+	veor	d18, d18, d19
+	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
+	vand	d21, d21, d31
+	vext.8	q8, q8, q8, #15
+	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	d23, #0
+	vext.8	q9, q9, q9, #14
+	veor	d20, d20, d21
+	vmull.p8	q2, d27, d7		@ D = A*B
+	vext.8	q11, q11, q11, #12
+	vext.8	q10, q10, q10, #13
+	veor	q8, q8, q9
+	veor	q10, q10, q11
+	veor	q2, q2, q8
+	veor	q2, q2, q10
+	veor	q1,q1,q0		@ Karatsuba post-processing
+	veor	q1,q1,q2
+	veor	d1,d1,d2
+	veor	d4,d4,d3	@ Xh|Xl - 256-bit result
+
+	@ equivalent of reduction_avx from ghash-x86_64.pl
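+	@ GHASH reduces mod x^128 + x^7 + x^2 + x + 1 on bit-reflected
+	@ operands, so folding x^128 back in appears as left shifts by
+	@ 63, 62 and 57 (the reflected images of x, x^2 and x^7).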
+	vshl.i64	q9,q0,#57		@ 1st phase
+	vshl.i64	q10,q0,#62
+	veor	q10,q10,q9		@
+	vshl.i64	q9,q0,#63
+	veor	q10, q10, q9		@
+	veor	d1,d1,d20	@
+	veor	d4,d4,d21
+
+	vshr.u64	q10,q0,#1		@ 2nd phase
+	veor	q2,q2,q0
+	veor	q0,q0,q10		@
+	vshr.u64	q10,q10,#6
+	vshr.u64	q0,q0,#1		@
+	veor	q0,q0,q2		@
+	veor	q0,q0,q10		@
+
+	subs	r3,#16
+	bne	Loop_neon
+
+#ifdef __ARMEL__
+	vrev64.8	q0,q0
+#endif
+	sub	r0,#16
+	vst1.64	d1,[r0]!		@ write out Xi
+	vst1.64	d0,[r0]
+
+	bx	lr					@ bx lr
+
+#endif
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/ghashv8-armx32.S b/apple-arm/crypto/fipsmodule/ghashv8-armx32.S
new file mode 100644
index 0000000..dcac580
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/ghashv8-armx32.S
@@ -0,0 +1,260 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+
+.code	32
+#undef	__thumb2__
+.globl	_gcm_init_v8
+.private_extern	_gcm_init_v8
+#ifdef __thumb2__
+.thumb_func	_gcm_init_v8
+#endif
+.align	4
+_gcm_init_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r1]		@ load input H
+	vmov.i8	q11,#0xe1
+	vshl.i64	q11,q11,#57		@ 0xc2.0
+	vext.8	q3,q9,q9,#8
+	vshr.u64	q10,q11,#63
+	vdup.32	q9,d18[1]
+	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
+	vshr.u64	q10,q3,#63
+	vshr.s32	q9,q9,#31		@ broadcast carry bit
+	vand	q10,q10,q8
+	vshl.i64	q3,q3,#1
+	vext.8	q10,q10,q10,#8
+	vand	q8,q8,q9
+	vorr	q3,q3,q10		@ H<<<=1
+	veor	q12,q3,q8		@ twisted H
+	vst1.64	{q12},[r0]!		@ store Htable[0]
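+	@ (The "twist" doubles H modulo the reduction polynomial; the
+	@ 0xc2..01 constant is the reflected polynomial, applied when the
+	@ top bit carries out. Pre-doubling H absorbs the one-bit shift
+	@ that bit-reflected GHASH would otherwise need per multiply.)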
+
+	@ calculate H^2
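+	@ Karatsuba: with A = A1*x^64 + A0 and B = B1*x^64 + B0,
+	@ A*B = A1B1*x^128 + ((A1^A0)(B1^B0) ^ A1B1 ^ A0B0)*x^64 + A0B0,
+	@ saving one pmull per 128-bit multiply; the (hi^lo) values are
+	@ precomputed here and packed into Htable.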
+	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
+.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
+	veor	q8,q8,q12
+.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
+.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q14,q0,q10
+
+	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
+	veor	q9,q9,q14
+	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
+	vst1.64	{q13,q14},[r0]!	@ store Htable[1..2]
+	bx	lr
+
+.globl	_gcm_gmult_v8
+.private_extern	_gcm_gmult_v8
+#ifdef __thumb2__
+.thumb_func	_gcm_gmult_v8
+#endif
+.align	4
+_gcm_gmult_v8:
+	AARCH64_VALID_CALL_TARGET
+	vld1.64	{q9},[r0]		@ load Xi
+	vmov.i8	q11,#0xe1
+	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
+	vshl.u64	q11,q11,#57
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q3,q9,q9,#8
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	bx	lr
+
+.globl	_gcm_ghash_v8
+.private_extern	_gcm_ghash_v8
+#ifdef __thumb2__
+.thumb_func	_gcm_ghash_v8
+#endif
+.align	4
+_gcm_ghash_v8:
+	AARCH64_VALID_CALL_TARGET
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	vld1.64	{q0},[r0]		@ load [rotated] Xi
+						@ "[rotated]" means that
+						@ loaded value would have
+						@ to be rotated in order to
+						@ make it appear as in
+						@ algorithm specification
+	subs	r3,r3,#32		@ see if r3 is 32 or larger
+	mov	r12,#16		@ r12 is used as post-
+						@ increment for input pointer;
+						@ as loop is modulo-scheduled
+						@ r12 is zeroed just in time
+						@ to preclude overstepping
+						@ inp[len], which means that
+						@ last block[s] are actually
+						@ loaded twice, but last
+						@ copy is not processed
+	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
+	vmov.i8	q11,#0xe1
+	vld1.64	{q14},[r1]
+	moveq	r12,#0			@ is it time to zero r12?
+	vext.8	q0,q0,q0,#8		@ rotate Xi
+	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
+	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+	vrev64.8	q0,q0
+#endif
+	vext.8	q3,q8,q8,#8		@ rotate I[0]
+	blo	Lodd_tail_v8		@ r3 was less than 32
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vext.8	q7,q9,q9,#8
+	veor	q3,q3,q0		@ I[i]^=Xi
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	b	Loop_mod2x_v8
+
+.align	4
+Loop_mod2x_v8:
+	vext.8	q10,q3,q3,#8
+	subs	r3,r3,#32		@ is there more data?
+.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
+	movlo	r12,#0			@ is it time to zero r12?
+
+.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
+	veor	q10,q10,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
+	veor	q0,q0,q4		@ accumulate
+.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]
+
+	veor	q2,q2,q6
+	moveq	r12,#0			@ is it time to zero r12?
+	veor	q1,q1,q5
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
+#ifndef __ARMEB__
+	vrev64.8	q8,q8
+#endif
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+#ifndef __ARMEB__
+	vrev64.8	q9,q9
+#endif
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	vext.8	q7,q9,q9,#8
+	vext.8	q3,q8,q8,#8
+	veor	q0,q1,q10
+.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
+	veor	q3,q3,q2		@ accumulate q3 early
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q3,q3,q10
+	veor	q9,q9,q7		@ Karatsuba pre-processing
+	veor	q3,q3,q0
+.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
+	bhs	Loop_mod2x_v8		@ there was at least 32 more bytes
+
+	veor	q2,q2,q10
+	vext.8	q3,q8,q8,#8		@ re-construct q3
+	adds	r3,r3,#32		@ re-construct r3
+	veor	q0,q0,q2		@ re-construct q0
+	beq	Ldone_v8		@ is r3 zero?
+Lodd_tail_v8:
+	vext.8	q10,q0,q0,#8
+	veor	q3,q3,q0		@ inp^=Xi
+	veor	q9,q8,q10		@ q9 is rotated inp^Xi
+
+.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
+	veor	q9,q9,q3		@ Karatsuba pre-processing
+.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
+.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
+	veor	q10,q0,q2
+	veor	q1,q1,q9
+	veor	q1,q1,q10
+.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction
+
+	vmov	d4,d3		@ Xh|Xm - 256-bit result
+	vmov	d3,d0		@ Xm is rotated Xl
+	veor	q0,q1,q10
+
+	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
+.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
+	veor	q10,q10,q2
+	veor	q0,q0,q10
+
+Ldone_v8:
+#ifndef __ARMEB__
+	vrev64.8	q0,q0
+#endif
+	vext.8	q0,q0,q0,#8
+	vst1.64	{q0},[r0]		@ write out Xi
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
+	bx	lr
+
+.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha1-armv4-large.S b/apple-arm/crypto/fipsmodule/sha1-armv4-large.S
new file mode 100644
index 0000000..82ac8df
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha1-armv4-large.S
@@ -0,0 +1,1518 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+.globl	_sha1_block_data_order
+.private_extern	_sha1_block_data_order
+#ifdef __thumb2__
+.thumb_func	_sha1_block_data_order
+#endif
+
+.align	5
+_sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+Lsha1_block:
+	adr	r3,Lsha1_block
+	ldr	r12,LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA1
+	bne	LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	LNEON
+#endif
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	ldmia	r0,{r3,r4,r5,r6,r7}
+Lloop:
+	ldr	r8,LK_00_19
+	mov	r14,sp
+	sub	sp,sp,#15*4
+	mov	r5,r5,ror#30
+	mov	r6,r6,ror#30
+	mov	r7,r7,ror#30		@ [6]
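+	@ Each round is E += ROL(A,5) + F(B,C,D) + W[i] + K with
+	@ B = ROL(B,30); here the ROL(,30) is deferred, so some registers
+	@ are kept pre-rotated (the ror#30 above, ror#2 inside the rounds)
+	@ as they cycle through the A..E roles.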
+L_00_15:
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r4,r5			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r6,r8,r6,ror#2			@ E+=K_00_19
+	eor	r10,r4,r5			@ F_xx_xx
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r3,r10,ror#2
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r3,r4			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r5,r8,r5,ror#2			@ E+=K_00_19
+	eor	r10,r3,r4			@ F_xx_xx
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r7,r10,ror#2
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r7,r3			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r4,r8,r4,ror#2			@ E+=K_00_19
+	eor	r10,r7,r3			@ F_xx_xx
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r6,r10,ror#2
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r6,r7			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r3,r8,r3,ror#2			@ E+=K_00_19
+	eor	r10,r6,r7			@ F_xx_xx
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r5,r10,ror#2
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	L_00_15		@ [((11+4)*5+2)*3]
+	sub	sp,sp,#25*4
+#if __ARM_ARCH__<7
+	ldrb	r10,[r1,#2]
+	ldrb	r9,[r1,#3]
+	ldrb	r11,[r1,#1]
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	ldrb	r12,[r1],#4
+	orr	r9,r9,r10,lsl#8
+	eor	r10,r5,r6			@ F_xx_xx
+	orr	r9,r9,r11,lsl#16
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	orr	r9,r9,r12,lsl#24
+#else
+	ldr	r9,[r1],#4			@ handles unaligned
+	add	r7,r8,r7,ror#2			@ E+=K_00_19
+	eor	r10,r5,r6			@ F_xx_xx
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	r9,r9				@ byte swap
+#endif
+#endif
+	and	r10,r4,r10,ror#2
+	add	r7,r7,r9			@ E+=X[i]
+	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
+	str	r9,[r14,#-4]!
+	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
+	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
+	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
+	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
+	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
+
+	ldr	r8,LK_20_39		@ [+15+16*4]
+	cmn	sp,#0			@ [+3], clear carry to denote 20_39
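+	@ Rounds 20-39 and 60-79 share F = B^C^D, so one loop body serves
+	@ both; the carry flag distinguishes them: cmn sp,#0 clears carry
+	@ (sp + 0 never carries out), cmp sp,#0 before the 60_79 pass sets
+	@ it (sp - 0 never borrows), and bcs exits after 60_79.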
+L_20_39_or_60_79:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r4,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r3,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r7,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r6,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	eor	r10,r5,r10,ror#2					@ F_xx_xx
+						@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp			@ preserve carry
+#endif
+	bne	L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
+	bcs	L_done			@ [+((12+3)*5+2)*4], spare 300 bytes
+
+	ldr	r8,LK_40_59
+	sub	sp,sp,#20*4		@ [+2]
+L_40_59:
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r5,r6			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r4,r10,ror#2					@ F_xx_xx
+	and	r11,r5,r6					@ F_xx_xx
+	add	r7,r7,r9			@ E+=X[i]
+	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
+	add	r7,r7,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r4,r5			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r3,r10,ror#2					@ F_xx_xx
+	and	r11,r4,r5					@ F_xx_xx
+	add	r6,r6,r9			@ E+=X[i]
+	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
+	add	r6,r6,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r3,r4			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r7,r10,ror#2					@ F_xx_xx
+	and	r11,r3,r4					@ F_xx_xx
+	add	r5,r5,r9			@ E+=X[i]
+	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
+	add	r5,r5,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r7,r3			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r6,r10,ror#2					@ F_xx_xx
+	and	r11,r7,r3					@ F_xx_xx
+	add	r4,r4,r9			@ E+=X[i]
+	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
+	add	r4,r4,r11,ror#2
+	ldr	r9,[r14,#15*4]
+	ldr	r10,[r14,#13*4]
+	ldr	r11,[r14,#7*4]
+	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
+	ldr	r12,[r14,#2*4]
+	eor	r9,r9,r10
+	eor	r11,r11,r12			@ 1 cycle stall
+	eor	r10,r6,r7			@ F_xx_xx
+	mov	r9,r9,ror#31
+	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
+	eor	r9,r9,r11,ror#31
+	str	r9,[r14,#-4]!
+	and	r10,r5,r10,ror#2					@ F_xx_xx
+	and	r11,r6,r7					@ F_xx_xx
+	add	r3,r3,r9			@ E+=X[i]
+	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
+	add	r3,r3,r11,ror#2
+#if defined(__thumb2__)
+	mov	r12,sp
+	teq	r14,r12
+#else
+	teq	r14,sp
+#endif
+	bne	L_40_59		@ [+((12+5)*5+2)*4]
+
+	ldr	r8,LK_60_79
+	sub	sp,sp,#20*4
+	cmp	sp,#0			@ set carry to denote 60_79
+	b	L_20_39_or_60_79	@ [+4], spare 300 bytes
+L_done:
+	add	sp,sp,#80*4		@ "deallocate" stack frame
+	ldmia	r0,{r8,r9,r10,r11,r12}
+	add	r3,r8,r3
+	add	r4,r9,r4
+	add	r5,r10,r5,ror#2
+	add	r6,r11,r6,ror#2
+	add	r7,r12,r7,ror#2
+	stmia	r0,{r3,r4,r5,r6,r7}
+	teq	r1,r2
+	bne	Lloop			@ [+18], total 1307
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+
+.align	5
+LK_00_19:.word	0x5a827999
+LK_20_39:.word	0x6ed9eba1
+LK_40_59:.word	0x8f1bbcdc
+LK_60_79:.word	0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-Lsha1_block
+#endif
+.byte	83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	5
+#if __ARM_MAX_ARCH__>=7
+
+
+
+#ifdef __thumb2__
+.thumb_func	sha1_block_data_order_neon
+#endif
+.align	4
+sha1_block_data_order_neon:
+LNEON:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	add	r2,r1,r2,lsl#6	@ r2 to point at the end of r1
+	@ dmb				@ errata #451034 on early Cortex A8
+	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
+	mov	r14,sp
+	sub	r12,sp,#64
+	adr	r8,LK_00_19
+	bic	r12,r12,#15		@ align for 128-bit stores
+
+	ldmia	r0,{r3,r4,r5,r6,r7}	@ load context
+	mov	sp,r12		@ alloca
+
+	vld1.8	{q0,q1},[r1]!	@ handles unaligned
+	veor	q15,q15,q15
+	vld1.8	{q2,q3},[r1]!
+	vld1.32	{d28[],d29[]},[r8,:32]!	@ load K_00_19
+	vrev32.8	q0,q0		@ yes, even on
+	vrev32.8	q1,q1		@ big-endian...
+	vrev32.8	q2,q2
+	vadd.i32	q8,q0,q14
+	vrev32.8	q3,q3
+	vadd.i32	q9,q1,q14
+	vst1.32	{q8},[r12,:128]!
+	vadd.i32	q10,q2,q14
+	vst1.32	{q9},[r12,:128]!
+	vst1.32	{q10},[r12,:128]!
+	ldr	r9,[sp]			@ big RAW stall
+
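+	@ The NEON path runs the scalar rounds interleaved with a
+	@ vectorized message schedule: W[i] = ROL(W[i-3]^W[i-8]^W[i-14]
+	@ ^W[i-16], 1) is produced four words at a time (vext/veor/vsri/
+	@ vsli below) and W[i]+K is parked on the stack for the scalar
+	@ rounds to pick up.
+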
+Loop_neon:
+	vext.8	q8,q0,q1,#8
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	and	r11,r5,r4
+	vadd.i32	q13,q3,q14
+	ldr	r9,[sp,#4]
+	add	r7,r7,r3,ror#27
+	vext.8	q12,q3,q15,#4
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q8,q8,q0
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	veor	q12,q12,q2
+	and	r11,r4,r3
+	ldr	r9,[sp,#8]
+	veor	q12,q12,q8
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	vadd.i32	q8,q12,q12
+	and	r11,r3,r7
+	ldr	r9,[sp,#12]
+	vsri.32	q8,q12,#31
+	add	r5,r5,r6,ror#27
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vshr.u32	q12,q13,#30
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	vshl.u32	q13,q13,#2
+	add	r4,r4,r9
+	and	r11,r7,r6
+	veor	q8,q8,q12
+	ldr	r9,[sp,#16]
+	add	r4,r4,r5,ror#27
+	veor	q8,q8,q13
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q9,q1,q2,#8
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	vadd.i32	q13,q8,q14
+	ldr	r9,[sp,#20]
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r4,ror#27
+	vext.8	q12,q8,q15,#4
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	veor	q9,q9,q1
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	veor	q12,q12,q3
+	and	r11,r5,r4
+	ldr	r9,[sp,#24]
+	veor	q12,q12,q9
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	vadd.i32	q9,q12,q12
+	and	r11,r4,r3
+	ldr	r9,[sp,#28]
+	vsri.32	q9,q12,#31
+	add	r6,r6,r7,ror#27
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vshr.u32	q12,q13,#30
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	vshl.u32	q13,q13,#2
+	add	r5,r5,r9
+	and	r11,r3,r7
+	veor	q9,q9,q12
+	ldr	r9,[sp,#32]
+	add	r5,r5,r6,ror#27
+	veor	q9,q9,q13
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q10,q2,q3,#8
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	vadd.i32	q13,q9,q14
+	ldr	r9,[sp,#36]
+	add	r4,r4,r5,ror#27
+	vext.8	q12,q9,q15,#4
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q10,q10,q2
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	veor	q12,q12,q8
+	and	r11,r6,r5
+	ldr	r9,[sp,#40]
+	veor	q12,q12,q10
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r6,r4
+	add	r7,r7,r9
+	vadd.i32	q10,q12,q12
+	and	r11,r5,r4
+	ldr	r9,[sp,#44]
+	vsri.32	q10,q12,#31
+	add	r7,r7,r3,ror#27
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	vshr.u32	q12,q13,#30
+	add	r7,r7,r11
+	bic	r10,r5,r3
+	vshl.u32	q13,q13,#2
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q10,q10,q12
+	ldr	r9,[sp,#48]
+	add	r6,r6,r7,ror#27
+	veor	q10,q10,q13
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q11,q3,q8,#8
+	bic	r10,r4,r7
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vadd.i32	q13,q10,q14
+	ldr	r9,[sp,#52]
+	add	r5,r5,r6,ror#27
+	vext.8	q12,q10,q15,#4
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q11,q11,q3
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	veor	q12,q12,q9
+	and	r11,r7,r6
+	ldr	r9,[sp,#56]
+	veor	q12,q12,q11
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	vst1.32	{q13},[r12,:128]!
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q13,q15,q12,#4
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	vadd.i32	q11,q12,q12
+	and	r11,r6,r5
+	ldr	r9,[sp,#60]
+	vsri.32	q11,q12,#31
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	vshr.u32	q12,q13,#30
+	add	r3,r3,r11
+	bic	r10,r6,r4
+	vshl.u32	q13,q13,#2
+	add	r7,r7,r9
+	and	r11,r5,r4
+	veor	q11,q11,q12
+	ldr	r9,[sp,#0]
+	add	r7,r7,r3,ror#27
+	veor	q11,q11,q13
+	eor	r11,r11,r10
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q10,q11,#8
+	bic	r10,r5,r3
+	add	r6,r6,r9
+	and	r11,r4,r3
+	veor	q0,q0,q8
+	ldr	r9,[sp,#4]
+	add	r6,r6,r7,ror#27
+	veor	q0,q0,q1
+	eor	r11,r11,r10
+	mov	r3,r3,ror#2
+	vadd.i32	q13,q11,q14
+	add	r6,r6,r11
+	bic	r10,r4,r7
+	veor	q12,q12,q0
+	add	r5,r5,r9
+	and	r11,r3,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r11,r10
+	mov	r7,r7,ror#2
+	vsli.32	q0,q12,#2
+	add	r5,r5,r11
+	bic	r10,r3,r6
+	add	r4,r4,r9
+	and	r11,r7,r6
+	ldr	r9,[sp,#12]
+	add	r4,r4,r5,ror#27
+	eor	r11,r11,r10
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	bic	r10,r7,r5
+	add	r3,r3,r9
+	and	r11,r6,r5
+	ldr	r9,[sp,#16]
+	add	r3,r3,r4,ror#27
+	eor	r11,r11,r10
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
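+@ Rounds 20-39: F(b,c,d) switches to the parity function b^c^d, computed
+@ with the eor r10/r11 pairs below.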
+	vext.8	q12,q11,q0,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q1,q1,q2
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q0,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q1
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r4
+	vshr.u32	q1,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q1,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q0,q1,#8
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	veor	q2,q2,q3
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	vadd.i32	q13,q1,q14
+	eor	r10,r4,r6
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r7,r7,r9
+	veor	q12,q12,q2
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r5
+	vshr.u32	q2,q12,#30
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	vsli.32	q2,q12,#2
+	add	r6,r6,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q1,q2,#8
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	veor	q3,q3,q8
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r6
+	vshr.u32	q3,q12,#30
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vsli.32	q3,q12,#2
+	add	r7,r7,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q2,q3,#8
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#4]
+	veor	q8,q8,q0
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	veor	q8,q8,q9
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	veor	q12,q12,q8
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r7
+	vshr.u32	q8,q12,#30
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	vsli.32	q8,q12,#2
+	add	r3,r3,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q3,q8,#8
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#20]
+	veor	q9,q9,q1
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	veor	q9,q9,q10
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vadd.i32	q13,q8,q14
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	veor	q12,q12,q9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r3
+	vshr.u32	q9,q12,#30
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	vsli.32	q9,q12,#2
+	add	r4,r4,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
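+@ Rounds 40-59: F(b,c,d) = Maj(b,c,d), added as the two disjoint pieces
+@ (c&d) and ((c^d)&b).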
+	vext.8	q12,q8,q9,#8
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#36]
+	veor	q10,q10,q2
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	veor	q10,q10,q11
+	add	r7,r7,r10
+	and	r11,r11,r4
+	vadd.i32	q13,q9,q14
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	veor	q12,q12,q10
+	add	r6,r6,r9
+	and	r10,r4,r5
+	vshr.u32	q10,q12,#30
+	ldr	r9,[sp,#40]
+	add	r6,r6,r7,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	vsli.32	q10,q12,#2
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#44]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#48]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vext.8	q12,q9,q10,#8
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#52]
+	veor	q11,q11,q3
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	veor	q11,q11,q0
+	add	r3,r3,r10
+	and	r11,r11,r5
+	vadd.i32	q13,q10,q14
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	veor	q12,q12,q11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	vshr.u32	q11,q12,#30
+	ldr	r9,[sp,#56]
+	add	r7,r7,r3,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	vsli.32	q11,q12,#2
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#60]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#0]
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vext.8	q12,q10,q11,#8
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#4]
+	veor	q0,q0,q8
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	veor	q0,q0,q1
+	add	r4,r4,r10
+	and	r11,r11,r6
+	vadd.i32	q13,q11,q14
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	veor	q12,q12,q0
+	add	r3,r3,r9
+	and	r10,r6,r7
+	vshr.u32	q0,q12,#30
+	ldr	r9,[sp,#8]
+	add	r3,r3,r4,ror#27
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	vsli.32	q0,q12,#2
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#12]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#16]
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	add	r6,r6,r10
+	and	r11,r11,r3
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vext.8	q12,q11,q0,#8
+	add	r5,r5,r9
+	and	r10,r3,r4
+	ldr	r9,[sp,#20]
+	veor	q1,q1,q9
+	add	r5,r5,r6,ror#27
+	eor	r11,r3,r4
+	veor	q1,q1,q2
+	add	r5,r5,r10
+	and	r11,r11,r7
+	vadd.i32	q13,q0,q14
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	veor	q12,q12,q1
+	add	r4,r4,r9
+	and	r10,r7,r3
+	vshr.u32	q1,q12,#30
+	ldr	r9,[sp,#24]
+	add	r4,r4,r5,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	vsli.32	q1,q12,#2
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#28]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	add	r7,r7,r9
+	and	r10,r5,r6
+	ldr	r9,[sp,#32]
+	add	r7,r7,r3,ror#27
+	eor	r11,r5,r6
+	add	r7,r7,r10
+	and	r11,r11,r4
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vext.8	q12,q0,q1,#8
+	add	r6,r6,r9
+	and	r10,r4,r5
+	ldr	r9,[sp,#36]
+	veor	q2,q2,q10
+	add	r6,r6,r7,ror#27
+	eor	r11,r4,r5
+	veor	q2,q2,q3
+	add	r6,r6,r10
+	and	r11,r11,r3
+	vadd.i32	q13,q1,q14
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	veor	q12,q12,q2
+	add	r5,r5,r9
+	and	r10,r3,r4
+	vshr.u32	q2,q12,#30
+	ldr	r9,[sp,#40]
+	add	r5,r5,r6,ror#27
+	vst1.32	{q13},[r12,:128]!
+	eor	r11,r3,r4
+	add	r5,r5,r10
+	vsli.32	q2,q12,#2
+	and	r11,r11,r7
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	add	r4,r4,r9
+	and	r10,r7,r3
+	ldr	r9,[sp,#44]
+	add	r4,r4,r5,ror#27
+	eor	r11,r7,r3
+	add	r4,r4,r10
+	and	r11,r11,r6
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	add	r3,r3,r9
+	and	r10,r6,r7
+	ldr	r9,[sp,#48]
+	add	r3,r3,r4,ror#27
+	eor	r11,r6,r7
+	add	r3,r3,r10
+	and	r11,r11,r5
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
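+@ Rounds 60-79: back to the parity function b^c^d.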
+	vext.8	q12,q1,q2,#8
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#52]
+	veor	q3,q3,q11
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	veor	q3,q3,q8
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vadd.i32	q13,q2,q14
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	veor	q12,q12,q3
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r4
+	vshr.u32	q3,q12,#30
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	vst1.32	{q13},[r12,:128]!
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	vsli.32	q3,q12,#2
+	add	r5,r5,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#0]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	vadd.i32	q13,q3,q14
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	vst1.32	{q13},[r12,:128]!
+	sub	r12,r12,#64
+	teq	r1,r2
+	sub	r8,r8,#16
+	it	eq
+	subeq	r1,r1,#64
+	vld1.8	{q0,q1},[r1]!
+	ldr	r9,[sp,#4]
+	eor	r11,r10,r6
+	vld1.8	{q2,q3},[r1]!
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	vld1.32	{d28[],d29[]},[r8,:32]!
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	vrev32.8	q0,q0
+	add	r7,r7,r9
+	ldr	r9,[sp,#8]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#12]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#16]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	vrev32.8	q1,q1
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	vadd.i32	q8,q0,q14
+	ldr	r9,[sp,#20]
+	eor	r11,r10,r7
+	vst1.32	{q8},[r12,:128]!
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#24]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#28]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	ldr	r9,[sp,#32]
+	eor	r11,r10,r4
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	vrev32.8	q2,q2
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	vadd.i32	q9,q1,q14
+	ldr	r9,[sp,#36]
+	eor	r11,r10,r3
+	vst1.32	{q9},[r12,:128]!
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#40]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	ldr	r9,[sp,#44]
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	eor	r10,r4,r6
+	add	r7,r7,r9
+	ldr	r9,[sp,#48]
+	eor	r11,r10,r5
+	add	r7,r7,r3,ror#27
+	mov	r4,r4,ror#2
+	add	r7,r7,r11
+	vrev32.8	q3,q3
+	eor	r10,r3,r5
+	add	r6,r6,r9
+	vadd.i32	q10,q2,q14
+	ldr	r9,[sp,#52]
+	eor	r11,r10,r4
+	vst1.32	{q10},[r12,:128]!
+	add	r6,r6,r7,ror#27
+	mov	r3,r3,ror#2
+	add	r6,r6,r11
+	eor	r10,r7,r4
+	add	r5,r5,r9
+	ldr	r9,[sp,#56]
+	eor	r11,r10,r3
+	add	r5,r5,r6,ror#27
+	mov	r7,r7,ror#2
+	add	r5,r5,r11
+	eor	r10,r6,r3
+	add	r4,r4,r9
+	ldr	r9,[sp,#60]
+	eor	r11,r10,r7
+	add	r4,r4,r5,ror#27
+	mov	r6,r6,ror#2
+	add	r4,r4,r11
+	eor	r10,r5,r7
+	add	r3,r3,r9
+	eor	r11,r10,r6
+	add	r3,r3,r4,ror#27
+	mov	r5,r5,ror#2
+	add	r3,r3,r11
+	ldmia	r0,{r9,r10,r11,r12}	@ accumulate context
+	add	r3,r3,r9
+	ldr	r9,[r0,#16]
+	add	r4,r4,r10
+	add	r5,r5,r11
+	add	r6,r6,r12
+	it	eq
+	moveq	sp,r14
+	add	r7,r7,r9
+	it	ne
+	ldrne	r9,[sp]
+	stmia	r0,{r3,r4,r5,r6,r7}
+	itt	ne
+	addne	r12,sp,#3*16
+	bne	Loop_neon
+
+	@ vldmia	sp!,{d8-d15}
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xf,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d|0x10
+# endif
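+
+@ Editorial note: INST emits the raw 4-byte encoding of the ARMv8 SHA-1
+@ instructions used below, since assemblers without the crypto extensions
+@ reject the mnemonics; the byte order and qualifier bits differ between
+@ the Thumb-2 and ARM encodings, hence the two definitions.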
+
+#ifdef __thumb2__
+.thumb_func	sha1_block_data_order_armv8
+#endif
+.align	5
+sha1_block_data_order_armv8:
+LARMv8:
+	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
+
+	veor	q1,q1,q1
+	adr	r3,LK_00_19
+	vld1.32	{q0},[r0]!
+	vld1.32	{d2[0]},[r0]
+	sub	r0,r0,#16
+	vld1.32	{d16[],d17[]},[r3,:32]!
+	vld1.32	{d18[],d19[]},[r3,:32]!
+	vld1.32	{d20[],d21[]},[r3,:32]!
+	vld1.32	{d22[],d23[]},[r3,:32]
+
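+@ Editorial note: q0 carries the ABCD state and d2[0] carries E. Each round
+@ group pairs sha1h (the fixed rotate of A) with sha1c, sha1p or sha1m,
+@ which apply the Ch, parity and Maj round functions (rounds 0-19,
+@ 20-39/60-79 and 40-59 respectively), while sha1su0/sha1su1 advance the
+@ message schedule.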
+Loop_v8:
+	vld1.8	{q4,q5},[r1]!
+	vld1.8	{q6,q7},[r1]!
+	vrev32.8	q4,q4
+	vrev32.8	q5,q5
+
+	vadd.i32	q12,q8,q4
+	vrev32.8	q6,q6
+	vmov	q14,q0	@ offload
+	subs	r2,r2,#1
+
+	vadd.i32	q13,q8,q5
+	vrev32.8	q7,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 0
+	INST(0x68,0x0c,0x02,0xe2)	@ sha1c q0,q1,q12
+	vadd.i32	q12,q8,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 1
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q8,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 2
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q8,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 3
+	INST(0x6a,0x0c,0x06,0xe2)	@ sha1c q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 4
+	INST(0x68,0x0c,0x04,0xe2)	@ sha1c q0,q2,q12
+	vadd.i32	q12,q9,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 5
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 6
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q9,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 7
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q9,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 8
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 9
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q10,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 10
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 11
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q10,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 12
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q10,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0x4c,0x8c,0x3a,0xe2)	@ sha1su0 q4,q5,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 13
+	INST(0x6a,0x0c,0x26,0xe2)	@ sha1m q0,q3,q13
+	vadd.i32	q13,q11,q7
+	INST(0x8e,0x83,0xba,0xf3)	@ sha1su1 q4,q7
+	INST(0x4e,0xac,0x3c,0xe2)	@ sha1su0 q5,q6,q7
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 14
+	INST(0x68,0x0c,0x24,0xe2)	@ sha1m q0,q2,q12
+	vadd.i32	q12,q11,q4
+	INST(0x88,0xa3,0xba,0xf3)	@ sha1su1 q5,q4
+	INST(0x48,0xcc,0x3e,0xe2)	@ sha1su0 q6,q7,q4
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 15
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q5
+	INST(0x8a,0xc3,0xba,0xf3)	@ sha1su1 q6,q5
+	INST(0x4a,0xec,0x38,0xe2)	@ sha1su0 q7,q4,q5
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 16
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+	vadd.i32	q12,q11,q6
+	INST(0x8c,0xe3,0xba,0xf3)	@ sha1su1 q7,q6
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 17
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+	vadd.i32	q13,q11,q7
+
+	INST(0xc0,0x62,0xb9,0xf3)	@ sha1h q3,q0		@ 18
+	INST(0x68,0x0c,0x14,0xe2)	@ sha1p q0,q2,q12
+
+	INST(0xc0,0x42,0xb9,0xf3)	@ sha1h q2,q0		@ 19
+	INST(0x6a,0x0c,0x16,0xe2)	@ sha1p q0,q3,q13
+
+	vadd.i32	q1,q1,q2
+	vadd.i32	q0,q0,q14
+	bne	Loop_v8
+
+	vst1.32	{q0},[r0]!
+	vst1.32	{d2[0]},[r0]
+
+	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+	bx	lr					@ bx lr
+
+#endif
+#if __ARM_MAX_ARCH__>=7
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha256-armv4.S b/apple-arm/crypto/fipsmodule/sha256-armv4.S
new file mode 100644
index 0000000..0cf3648
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha256-armv4.S
@@ -0,0 +1,2846 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "absolute"
+@ terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
+@ [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that the latter performs sub-optimally; nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually encoded. (See unsha256.)
+
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+#else
+.code	32
+#endif
+
+
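+@ Editorial note: K256 is the table of 64 SHA-256 round constants (the
+@ first 32 bits of the fractional parts of the cube roots of the first 64
+@ primes). The zero word after it terminates the table, and the low byte of
+@ the final constant, 0xf2, is what the "done?" compares in the round code
+@ test for.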
+.align	5
+K256:
+.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.word	0				@ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-Lsha256_block_data_order
+#endif
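+@ Editorial note: LOPENSSL_armcap stores OPENSSL_armcap_P as an offset from
+@ Lsha256_block_data_order so the capability word can be located
+@ position-independently; on Apple targets one further load dereferences
+@ the non-lazy symbol pointer.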
+.align	5
+
+.globl	_sha256_block_data_order
+.private_extern	_sha256_block_data_order
+#ifdef __thumb2__
+.thumb_func	_sha256_block_data_order
+#endif
+_sha256_block_data_order:
+Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r3,pc,#8		@ _sha256_block_data_order
+#else
+	adr	r3,Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV8_SHA256
+	bne	LARMv8
+	tst	r12,#ARMV7_NEON
+	bne	LNEON
+#endif
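+	@ Integer-only ARMv4 path from here; the tst/bne pairs above route to
+	@ LARMv8 or LNEON when OPENSSL_armcap_P advertises ARMV8_SHA256 or
+	@ ARMV7_NEON.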
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r14,r3,#256+32	@ K256
+	sub	sp,sp,#16*4		@ alloca(X[16])
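+@ Editorial note: r4-r11 now hold the state a..h, r14 walks the K256 table,
+@ r1 is the input pointer (saved with r0/r2 above the frame), and the
+@ 16-word schedule window X[] occupies the 16 words below sp.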
+Loop:
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6		@ magic
+	eor	r12,r12,r12
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 0
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 0
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 0==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 0==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
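+@ Editorial note: each BODY_00_15 round above computes
+@ h += X[i] + K256[i] + Sigma1(e) + Ch(e,f,g), then d += h and
+@ h += Sigma0(a) + Maj(a,b,c); the Maj term is deferred to the start of the
+@ next round ("from the past") to shorten the critical path, and
+@ Sigma0/Sigma1 are built from a single eor chain plus a final rotate.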
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 1
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 1
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 1==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 1==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 2
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 2
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 2==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 2==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 3
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 3
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 3==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 3==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 4
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 4
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 4==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 4==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 5
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 5==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 5==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 6
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 6
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 6==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 6==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 7
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 7==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 7==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 8
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 8
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 8==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r8,r8,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 8==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 9
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 9
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 9==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r7,r7,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 9==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 10
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 10
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 10==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r6,r6,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 10==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 11
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 11
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 11==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r5,r5,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 11==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 12
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 12
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 12==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r4,r4,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 12==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 13
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 13
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 13==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r11,r11,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 13==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 14
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 14
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	ldrb	r12,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r12,lsl#8
+	ldrb	r12,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 14==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r10,r10,ror#5
+	orr	r2,r2,r12,lsl#24
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+#endif
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 14==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	@ ldr	r2,[r1],#4			@ 15
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+# ifndef __ARMEB__
+	rev	r2,r2
+# endif
+#else
+	@ ldrb	r2,[r1,#3]			@ 15
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	ldrb	r3,[r1,#2]
+	ldrb	r0,[r1,#1]
+	orr	r2,r2,r3,lsl#8
+	ldrb	r3,[r1],#4
+	orr	r2,r2,r0,lsl#16
+# if 15==15
+	str	r1,[sp,#17*4]			@ make room for r1
+# endif
+	eor	r0,r9,r9,ror#5
+	orr	r2,r2,r3,lsl#24
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+#endif
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 15==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+Lrounds_16_xx:
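+@ Editorial note: from round 16 on, X[i] is recomputed in place as
+@ X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]) (indices mod 16), with
+@ sigma0(x) = ror(x,7)^ror(x,18)^(x>>3) and
+@ sigma1(x) = ror(x,17)^ror(x,19)^(x>>10).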
+	@ ldr	r2,[sp,#1*4]		@ 16
+	@ ldr	r1,[sp,#14*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#0*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#9*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#0*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 16==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#2*4]		@ 17
+	@ ldr	r1,[sp,#15*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#1*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#10*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#1*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 17==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#3*4]		@ 18
+	@ ldr	r1,[sp,#0*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#2*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#11*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#2*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 18==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#4*4]		@ 19
+	@ ldr	r1,[sp,#1*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#3*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#12*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#3*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 19==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#5*4]		@ 20
+	@ ldr	r1,[sp,#2*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#4*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#13*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#4*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 20==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#6*4]		@ 21
+	@ ldr	r1,[sp,#3*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#5*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#14*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#5*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 21==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#7*4]		@ 22
+	@ ldr	r1,[sp,#4*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#6*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#15*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#6*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 22==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#8*4]		@ 23
+	@ ldr	r1,[sp,#5*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#7*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#0*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#7*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 23==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#9*4]		@ 24
+	@ ldr	r1,[sp,#6*4]
+	mov	r0,r2,ror#7
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#8*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#1*4]
+
+	add	r12,r12,r0
+	eor	r0,r8,r8,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r8,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r11,r11,r2			@ h+=X[i]
+	str	r2,[sp,#8*4]
+	eor	r2,r9,r10
+	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r8
+	add	r11,r11,r12			@ h+=K256[i]
+	eor	r2,r2,r10			@ Ch(e,f,g)
+	eor	r0,r4,r4,ror#11
+	add	r11,r11,r2			@ h+=Ch(e,f,g)
+#if 24==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r4,r5			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
+	eor	r12,r4,r5			@ a^b, b^c in next round
+	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r4,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r7,r7,r11			@ d+=h
+	eor	r3,r3,r5			@ Maj(a,b,c)
+	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#10*4]		@ 25
+	@ ldr	r1,[sp,#7*4]
+	mov	r0,r2,ror#7
+	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#9*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#2*4]
+
+	add	r3,r3,r0
+	eor	r0,r7,r7,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r7,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r10,r10,r2			@ h+=X[i]
+	str	r2,[sp,#9*4]
+	eor	r2,r8,r9
+	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r7
+	add	r10,r10,r3			@ h+=K256[i]
+	eor	r2,r2,r9			@ Ch(e,f,g)
+	eor	r0,r11,r11,ror#11
+	add	r10,r10,r2			@ h+=Ch(e,f,g)
+#if 25==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r11,r4			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
+	eor	r3,r11,r4			@ a^b, b^c in next round
+	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r11,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r6,r6,r10			@ d+=h
+	eor	r12,r12,r4			@ Maj(a,b,c)
+	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#11*4]		@ 26
+	@ ldr	r1,[sp,#8*4]
+	mov	r0,r2,ror#7
+	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#10*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#3*4]
+
+	add	r12,r12,r0
+	eor	r0,r6,r6,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r6,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r9,r9,r2			@ h+=X[i]
+	str	r2,[sp,#10*4]
+	eor	r2,r7,r8
+	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r6
+	add	r9,r9,r12			@ h+=K256[i]
+	eor	r2,r2,r8			@ Ch(e,f,g)
+	eor	r0,r10,r10,ror#11
+	add	r9,r9,r2			@ h+=Ch(e,f,g)
+#if 26==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r10,r11			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
+	eor	r12,r10,r11			@ a^b, b^c in next round
+	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r10,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r5,r5,r9			@ d+=h
+	eor	r3,r3,r11			@ Maj(a,b,c)
+	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#12*4]		@ 27
+	@ ldr	r1,[sp,#9*4]
+	mov	r0,r2,ror#7
+	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#11*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#4*4]
+
+	add	r3,r3,r0
+	eor	r0,r5,r5,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r5,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r8,r8,r2			@ h+=X[i]
+	str	r2,[sp,#11*4]
+	eor	r2,r6,r7
+	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r5
+	add	r8,r8,r3			@ h+=K256[i]
+	eor	r2,r2,r7			@ Ch(e,f,g)
+	eor	r0,r9,r9,ror#11
+	add	r8,r8,r2			@ h+=Ch(e,f,g)
+#if 27==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r9,r10			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
+	eor	r3,r9,r10			@ a^b, b^c in next round
+	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r9,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r4,r4,r8			@ d+=h
+	eor	r12,r12,r10			@ Maj(a,b,c)
+	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#13*4]		@ 28
+	@ ldr	r1,[sp,#10*4]
+	mov	r0,r2,ror#7
+	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#12*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#5*4]
+
+	add	r12,r12,r0
+	eor	r0,r4,r4,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r4,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r7,r7,r2			@ h+=X[i]
+	str	r2,[sp,#12*4]
+	eor	r2,r5,r6
+	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r4
+	add	r7,r7,r12			@ h+=K256[i]
+	eor	r2,r2,r6			@ Ch(e,f,g)
+	eor	r0,r8,r8,ror#11
+	add	r7,r7,r2			@ h+=Ch(e,f,g)
+#if 28==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r8,r9			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
+	eor	r12,r8,r9			@ a^b, b^c in next round
+	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r8,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r11,r11,r7			@ d+=h
+	eor	r3,r3,r9			@ Maj(a,b,c)
+	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#14*4]		@ 29
+	@ ldr	r1,[sp,#11*4]
+	mov	r0,r2,ror#7
+	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#13*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#6*4]
+
+	add	r3,r3,r0
+	eor	r0,r11,r11,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r11,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r6,r6,r2			@ h+=X[i]
+	str	r2,[sp,#13*4]
+	eor	r2,r4,r5
+	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r11
+	add	r6,r6,r3			@ h+=K256[i]
+	eor	r2,r2,r5			@ Ch(e,f,g)
+	eor	r0,r7,r7,ror#11
+	add	r6,r6,r2			@ h+=Ch(e,f,g)
+#if 29==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r7,r8			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
+	eor	r3,r7,r8			@ a^b, b^c in next round
+	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r7,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r10,r10,r6			@ d+=h
+	eor	r12,r12,r8			@ Maj(a,b,c)
+	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#15*4]		@ 30
+	@ ldr	r1,[sp,#12*4]
+	mov	r0,r2,ror#7
+	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
+	mov	r12,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r12,r12,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#14*4]
+	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#7*4]
+
+	add	r12,r12,r0
+	eor	r0,r10,r10,ror#5	@ from BODY_00_15
+	add	r2,r2,r12
+	eor	r0,r0,r10,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r12,[r14],#4			@ *K256++
+	add	r5,r5,r2			@ h+=X[i]
+	str	r2,[sp,#14*4]
+	eor	r2,r11,r4
+	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r10
+	add	r5,r5,r12			@ h+=K256[i]
+	eor	r2,r2,r4			@ Ch(e,f,g)
+	eor	r0,r6,r6,ror#11
+	add	r5,r5,r2			@ h+=Ch(e,f,g)
+#if 30==31
+	and	r12,r12,#0xff
+	cmp	r12,#0xf2			@ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r12,r6,r7			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
+	eor	r12,r6,r7			@ a^b, b^c in next round
+	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r6,ror#20	@ Sigma0(a)
+	and	r3,r3,r12			@ (b^c)&=(a^b)
+	add	r9,r9,r5			@ d+=h
+	eor	r3,r3,r7			@ Maj(a,b,c)
+	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
+	@ ldr	r2,[sp,#0*4]		@ 31
+	@ ldr	r1,[sp,#13*4]
+	mov	r0,r2,ror#7
+	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
+	mov	r3,r1,ror#17
+	eor	r0,r0,r2,ror#18
+	eor	r3,r3,r1,ror#19
+	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
+	ldr	r2,[sp,#15*4]
+	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
+	ldr	r1,[sp,#8*4]
+
+	add	r3,r3,r0
+	eor	r0,r9,r9,ror#5	@ from BODY_00_15
+	add	r2,r2,r3
+	eor	r0,r0,r9,ror#19	@ Sigma1(e)
+	add	r2,r2,r1			@ X[i]
+	ldr	r3,[r14],#4			@ *K256++
+	add	r4,r4,r2			@ h+=X[i]
+	str	r2,[sp,#15*4]
+	eor	r2,r10,r11
+	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
+	and	r2,r2,r9
+	add	r4,r4,r3			@ h+=K256[i]
+	eor	r2,r2,r11			@ Ch(e,f,g)
+	eor	r0,r5,r5,ror#11
+	add	r4,r4,r2			@ h+=Ch(e,f,g)
+#if 31==31
+	and	r3,r3,#0xff
+	cmp	r3,#0xf2			@ done?
+#endif
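+	@ the low byte of the K256 word just fetched is compared with 0xf2,
+	@ the low byte of the last constant 0xc67178f2, so the conditional
+	@ exit below fires once the table is exhausted.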
+#if 31<15
+# if __ARM_ARCH__>=7
+	ldr	r2,[r1],#4			@ prefetch
+# else
+	ldrb	r2,[r1,#3]
+# endif
+	eor	r3,r5,r6			@ a^b, b^c in next round
+#else
+	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
+	eor	r3,r5,r6			@ a^b, b^c in next round
+	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
+#endif
+	eor	r0,r0,r5,ror#20	@ Sigma0(a)
+	and	r12,r12,r3			@ (b^c)&=(a^b)
+	add	r8,r8,r4			@ d+=h
+	eor	r12,r12,r6			@ Maj(a,b,c)
+	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
+	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+	ite	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r3,[sp,#16*4]		@ pull ctx
+	bne	Lrounds_16_xx
+
+	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
+	ldr	r0,[r3,#0]
+	ldr	r2,[r3,#4]
+	ldr	r12,[r3,#8]
+	add	r4,r4,r0
+	ldr	r0,[r3,#12]
+	add	r5,r5,r2
+	ldr	r2,[r3,#16]
+	add	r6,r6,r12
+	ldr	r12,[r3,#20]
+	add	r7,r7,r0
+	ldr	r0,[r3,#24]
+	add	r8,r8,r2
+	ldr	r2,[r3,#28]
+	add	r9,r9,r12
+	ldr	r1,[sp,#17*4]		@ pull inp
+	ldr	r12,[sp,#18*4]		@ pull inp+len
+	add	r10,r10,r0
+	add	r11,r11,r2
+	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+	cmp	r1,r12
+	sub	r14,r14,#256	@ rewind Ktbl
+	bne	Loop
+
+	add	sp,sp,#19*4	@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
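+@ 0xe12fff1e is the raw encoding of "bx lr", emitted as a .word so the
+@ file still assembles for plain ARMv4; at run time a Thumb-aware core
+@ (ARMv4T and later) takes this path to return to a Thumb caller.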
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl	_sha256_block_data_order_neon
+.private_extern	_sha256_block_data_order_neon
+#ifdef __thumb2__
+.thumb_func	_sha256_block_data_order_neon
+#endif
+.align	5
+.skip	16
+_sha256_block_data_order_neon:
+LNEON:
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+	sub	r11,sp,#16*4+16
+	adr	r14,K256
+	bic	r11,r11,#15		@ align for 128-bit stores
+	mov	r12,sp
+	mov	sp,r11			@ alloca
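+	@ sp now points at a 16-byte-aligned scratch frame, so the
+	@ vst1.32 {q..},[r1,:128] stores below can use the :128 alignment
+	@ hint; the original sp is saved at [sp,#76] and restored on exit.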
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+
+	vld1.8	{q0},[r1]!
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	vld1.32	{q8},[r14,:128]!
+	vld1.32	{q9},[r14,:128]!
+	vld1.32	{q10},[r14,:128]!
+	vld1.32	{q11},[r14,:128]!
+	vrev32.8	q0,q0		@ yes, even on
+	str	r0,[sp,#64]
+	vrev32.8	q1,q1		@ big-endian
+	str	r1,[sp,#68]
+	mov	r1,sp
+	vrev32.8	q2,q2
+	str	r2,[sp,#72]
+	vrev32.8	q3,q3
+	str	r12,[sp,#76]		@ save original sp
+	vadd.i32	q8,q8,q0
+	vadd.i32	q9,q9,q1
+	vst1.32	{q8},[r1,:128]!
+	vadd.i32	q10,q10,q2
+	vst1.32	{q9},[r1,:128]!
+	vadd.i32	q11,q11,q3
+	vst1.32	{q10},[r1,:128]!
+	vst1.32	{q11},[r1,:128]!
+
+	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+	sub	r1,r1,#64
+	ldr	r2,[sp,#0]
+	eor	r12,r12,r12
+	eor	r3,r5,r6
+	b	L_00_48
+
+.align	4
+L_00_48:
+	vext.8	q8,q0,q1,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q2,q3,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q0,q0,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d7,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d7,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d7,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q0,q0,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d7,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d7,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d0,d0,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d0,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d0,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d0,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	vshr.u32	d24,d0,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d0,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d1,d1,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q0
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q1,q2,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q3,q0,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q1,q1,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d1,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d1,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d1,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q1,q1,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d1,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d1,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d2,d2,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d2,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d2,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d2,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	vshr.u32	d24,d2,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d2,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d3,d3,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q1
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vext.8	q8,q2,q3,#4
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	vext.8	q9,q0,q1,#4
+	add	r4,r4,r12
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vadd.i32	q2,q2,q9
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	veor	q9,q9,q10
+	add	r10,r10,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	vshr.u32	d24,d3,#17
+	add	r11,r11,r3
+	and	r2,r2,r7
+	veor	q9,q9,q11
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	vsli.32	d24,d3,#15
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	vshr.u32	d25,d3,#10
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	vadd.i32	q2,q2,q9
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r6,r6,r10
+	vshr.u32	d24,d3,#19
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	vsli.32	d24,d3,#13
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	veor	d25,d25,d24
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	vadd.i32	d4,d4,d25
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	vshr.u32	d24,d4,#17
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	vsli.32	d24,d4,#15
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	vshr.u32	d25,d4,#10
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	vshr.u32	d24,d4,#19
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	vld1.32	{q8},[r14,:128]!
+	add	r8,r8,r2
+	vsli.32	d24,d4,#13
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	veor	d25,d25,d24
+	add	r9,r9,r3
+	and	r2,r2,r5
+	vadd.i32	d5,d5,d25
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	vadd.i32	q8,q8,q2
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	vst1.32	{q8},[r1,:128]!
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vext.8	q8,q3,q0,#4
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	vext.8	q9,q1,q2,#4
+	add	r8,r8,r12
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	vshr.u32	q10,q8,#7
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vadd.i32	q3,q3,q9
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	vshr.u32	q9,q8,#3
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vsli.32	q10,q8,#25
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	vshr.u32	q11,q8,#18
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	veor	q9,q9,q10
+	add	r6,r6,r2
+	vsli.32	q11,q8,#14
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	vshr.u32	d24,d5,#17
+	add	r7,r7,r3
+	and	r2,r2,r11
+	veor	q9,q9,q11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	vsli.32	d24,d5,#15
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	vshr.u32	d25,d5,#10
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	vadd.i32	q3,q3,q9
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	veor	d25,d25,d24
+	and	r12,r12,r3
+	add	r10,r10,r6
+	vshr.u32	d24,d5,#19
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	vsli.32	d24,d5,#13
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	veor	d25,d25,d24
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	vadd.i32	d6,d6,d25
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	vshr.u32	d24,d6,#17
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	vsli.32	d24,d6,#15
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	vshr.u32	d25,d6,#10
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	veor	d25,d25,d24
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	vshr.u32	d24,d6,#19
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	vld1.32	{q8},[r14,:128]!
+	add	r4,r4,r2
+	vsli.32	d24,d6,#13
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	veor	d25,d25,d24
+	add	r5,r5,r3
+	and	r2,r2,r9
+	vadd.i32	d7,d7,d25
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	vadd.i32	q8,q8,q3
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[r14]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	vst1.32	{q8},[r1,:128]!
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	teq	r2,#0				@ check for K256 terminator
+	ldr	r2,[sp,#0]
+	sub	r1,r1,#64
+	bne	L_00_48
+
+	ldr	r1,[sp,#68]
+	ldr	r0,[sp,#72]
+	sub	r14,r14,#256	@ rewind r14
+	teq	r1,r0
+	it	eq
+	subeq	r1,r1,#64		@ avoid SEGV
+	vld1.8	{q0},[r1]!		@ load next input block
+	vld1.8	{q1},[r1]!
+	vld1.8	{q2},[r1]!
+	vld1.8	{q3},[r1]!
+	it	ne
+	strne	r1,[sp,#68]
+	mov	r1,sp
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q0,q0
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q0
+	ldr	r2,[sp,#4]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#8]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#12]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#16]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q1,q1
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q1
+	ldr	r2,[sp,#20]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#24]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#28]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#32]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	add	r11,r11,r2
+	eor	r2,r9,r10
+	eor	r0,r8,r8,ror#5
+	add	r4,r4,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r8
+	eor	r12,r0,r8,ror#19
+	eor	r0,r4,r4,ror#11
+	eor	r2,r2,r10
+	vrev32.8	q2,q2
+	add	r11,r11,r12,ror#6
+	eor	r12,r4,r5
+	eor	r0,r0,r4,ror#20
+	add	r11,r11,r2
+	vadd.i32	q8,q8,q2
+	ldr	r2,[sp,#36]
+	and	r3,r3,r12
+	add	r7,r7,r11
+	add	r11,r11,r0,ror#2
+	eor	r3,r3,r5
+	add	r10,r10,r2
+	eor	r2,r8,r9
+	eor	r0,r7,r7,ror#5
+	add	r11,r11,r3
+	and	r2,r2,r7
+	eor	r3,r0,r7,ror#19
+	eor	r0,r11,r11,ror#11
+	eor	r2,r2,r9
+	add	r10,r10,r3,ror#6
+	eor	r3,r11,r4
+	eor	r0,r0,r11,ror#20
+	add	r10,r10,r2
+	ldr	r2,[sp,#40]
+	and	r12,r12,r3
+	add	r6,r6,r10
+	add	r10,r10,r0,ror#2
+	eor	r12,r12,r4
+	add	r9,r9,r2
+	eor	r2,r7,r8
+	eor	r0,r6,r6,ror#5
+	add	r10,r10,r12
+	and	r2,r2,r6
+	eor	r12,r0,r6,ror#19
+	eor	r0,r10,r10,ror#11
+	eor	r2,r2,r8
+	add	r9,r9,r12,ror#6
+	eor	r12,r10,r11
+	eor	r0,r0,r10,ror#20
+	add	r9,r9,r2
+	ldr	r2,[sp,#44]
+	and	r3,r3,r12
+	add	r5,r5,r9
+	add	r9,r9,r0,ror#2
+	eor	r3,r3,r11
+	add	r8,r8,r2
+	eor	r2,r6,r7
+	eor	r0,r5,r5,ror#5
+	add	r9,r9,r3
+	and	r2,r2,r5
+	eor	r3,r0,r5,ror#19
+	eor	r0,r9,r9,ror#11
+	eor	r2,r2,r7
+	add	r8,r8,r3,ror#6
+	eor	r3,r9,r10
+	eor	r0,r0,r9,ror#20
+	add	r8,r8,r2
+	ldr	r2,[sp,#48]
+	and	r12,r12,r3
+	add	r4,r4,r8
+	add	r8,r8,r0,ror#2
+	eor	r12,r12,r10
+	vst1.32	{q8},[r1,:128]!
+	add	r7,r7,r2
+	eor	r2,r5,r6
+	eor	r0,r4,r4,ror#5
+	add	r8,r8,r12
+	vld1.32	{q8},[r14,:128]!
+	and	r2,r2,r4
+	eor	r12,r0,r4,ror#19
+	eor	r0,r8,r8,ror#11
+	eor	r2,r2,r6
+	vrev32.8	q3,q3
+	add	r7,r7,r12,ror#6
+	eor	r12,r8,r9
+	eor	r0,r0,r8,ror#20
+	add	r7,r7,r2
+	vadd.i32	q8,q8,q3
+	ldr	r2,[sp,#52]
+	and	r3,r3,r12
+	add	r11,r11,r7
+	add	r7,r7,r0,ror#2
+	eor	r3,r3,r9
+	add	r6,r6,r2
+	eor	r2,r4,r5
+	eor	r0,r11,r11,ror#5
+	add	r7,r7,r3
+	and	r2,r2,r11
+	eor	r3,r0,r11,ror#19
+	eor	r0,r7,r7,ror#11
+	eor	r2,r2,r5
+	add	r6,r6,r3,ror#6
+	eor	r3,r7,r8
+	eor	r0,r0,r7,ror#20
+	add	r6,r6,r2
+	ldr	r2,[sp,#56]
+	and	r12,r12,r3
+	add	r10,r10,r6
+	add	r6,r6,r0,ror#2
+	eor	r12,r12,r8
+	add	r5,r5,r2
+	eor	r2,r11,r4
+	eor	r0,r10,r10,ror#5
+	add	r6,r6,r12
+	and	r2,r2,r10
+	eor	r12,r0,r10,ror#19
+	eor	r0,r6,r6,ror#11
+	eor	r2,r2,r4
+	add	r5,r5,r12,ror#6
+	eor	r12,r6,r7
+	eor	r0,r0,r6,ror#20
+	add	r5,r5,r2
+	ldr	r2,[sp,#60]
+	and	r3,r3,r12
+	add	r9,r9,r5
+	add	r5,r5,r0,ror#2
+	eor	r3,r3,r7
+	add	r4,r4,r2
+	eor	r2,r10,r11
+	eor	r0,r9,r9,ror#5
+	add	r5,r5,r3
+	and	r2,r2,r9
+	eor	r3,r0,r9,ror#19
+	eor	r0,r5,r5,ror#11
+	eor	r2,r2,r11
+	add	r4,r4,r3,ror#6
+	eor	r3,r5,r6
+	eor	r0,r0,r5,ror#20
+	add	r4,r4,r2
+	ldr	r2,[sp,#64]
+	and	r12,r12,r3
+	add	r8,r8,r4
+	add	r4,r4,r0,ror#2
+	eor	r12,r12,r6
+	vst1.32	{q8},[r1,:128]!
+	ldr	r0,[r2,#0]
+	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
+	ldr	r12,[r2,#4]
+	ldr	r3,[r2,#8]
+	ldr	r1,[r2,#12]
+	add	r4,r4,r0			@ accumulate
+	ldr	r0,[r2,#16]
+	add	r5,r5,r12
+	ldr	r12,[r2,#20]
+	add	r6,r6,r3
+	ldr	r3,[r2,#24]
+	add	r7,r7,r1
+	ldr	r1,[r2,#28]
+	add	r8,r8,r0
+	str	r4,[r2],#4
+	add	r9,r9,r12
+	str	r5,[r2],#4
+	add	r10,r10,r3
+	str	r6,[r2],#4
+	add	r11,r11,r1
+	str	r7,[r2],#4
+	stmia	r2,{r8,r9,r10,r11}
+
+	ittte	ne
+	movne	r1,sp
+	ldrne	r2,[sp,#0]
+	eorne	r12,r12,r12
+	ldreq	sp,[sp,#76]			@ restore original sp
+	itt	ne
+	eorne	r3,r5,r6
+	bne	L_00_48
+
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)	.byte	a,b,c,d
+# endif
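+
+@ The sha256h/sha256h2/sha256su0/sha256su1 instructions below are
+@ emitted through INST as raw bytes so that assemblers without ARMv8
+@ crypto support can still build this file; in Thumb-2 the two 16-bit
+@ halfwords of each 32-bit encoding are stored swapped, hence the
+@ reordered (and adjusted) byte sequence in the Thumb definition.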
+
+#ifdef __thumb2__
+.thumb_func	sha256_block_data_order_armv8
+#endif
+.align	5
+sha256_block_data_order_armv8:
+LARMv8:
+	vld1.32	{q0,q1},[r0]
+	sub	r3,r3,#256+32
+	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
+	b	Loop_v8
+
+.align	4
+Loop_v8:
+	vld1.8	{q8,q9},[r1]!
+	vld1.8	{q10,q11},[r1]!
+	vld1.32	{q12},[r3]!
+	vrev32.8	q8,q8
+	vrev32.8	q9,q9
+	vrev32.8	q10,q10
+	vrev32.8	q11,q11
+	vmov	q14,q0	@ offload
+	vmov	q15,q1
+	teq	r1,r2
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q10
+	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q11
+	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
+	vld1.32	{q13},[r3]!
+	vadd.i32	q12,q12,q8
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vld1.32	{q12},[r3]!
+	vadd.i32	q13,q13,q9
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vld1.32	{q13},[r3]
+	vadd.i32	q12,q12,q10
+	sub	r3,r3,#256-16	@ rewind
+	vmov	q2,q0
+	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
+	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
+
+	vadd.i32	q13,q13,q11
+	vmov	q2,q0
+	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
+	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
+
+	vadd.i32	q0,q0,q14
+	vadd.i32	q1,q1,q15
+	it	ne
+	bne	Loop_v8
+
+	vst1.32	{q0,q1},[r0]
+
+	bx	lr		@ bx lr
+
+#endif
+.byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/sha512-armv4.S b/apple-arm/crypto/fipsmodule/sha512-armv4.S
new file mode 100644
index 0000000..21913cb
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/sha512-armv4.S
@@ -0,0 +1,1899 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License").  You may not use
+@ this file except in compliance with the License.  You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4, and it spends ~72 clock cycles per byte [on a single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for the dual-issue pipeline resulted in a 6% improvement
+@ on the Cortex A8 core, at ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in a 7%
+@ improvement on the Cortex A8 core, at ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles, or ~60% faster than the integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is a disappointing result.
+@ Technical writers asserted that the 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could not
+@ be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. As a side note, Cortex-A15 processes one byte
+@ in 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally the caller was expected to maintain a specific *dword*
+@ order in h[0-7], namely with the most significant dword at the *lower*
+@ address, which was reflected in the two parameters below as 0 and 4.
+@ Now the caller is expected to maintain native byte order for whole
+@ 64-bit values.
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
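+
+@ For example, on a little-endian build a 64-bit word 0x0123456789abcdef
+@ is stored with its low half 0x89abcdef at offset LO(=0) and its high
+@ half 0x01234567 at offset HI(=4); big-endian builds swap LO and HI so
+@ the [rN,#off+LO]/[rN,#off+HI] accesses below stay correct either way.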
+
+.text
+#if defined(__thumb2__)
+.syntax	unified
+.thumb
+# define adrl adr
+#else
+.code	32
+#endif
+
+
+.align	5
+K512:
+	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
+	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
+	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
+	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
+	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
+	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
+	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
+	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
+	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
+	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
+	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
+	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
+	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
+	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
+	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
+	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
+	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
+	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
+	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
+	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
+	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
+	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
+	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
+	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
+	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
+	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
+	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
+	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
+	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
+	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
+	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
+	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
+	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
+	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
+	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
+	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
+	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
+	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
+	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
+	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
+
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-Lsha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
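+
+@ LOPENSSL_armcap holds the offset of OPENSSL_armcap_P relative to
+@ Lsha512_block_data_order, so the capability word can be located
+@ PC-relatively; if it advertises NEON, the code branches to LNEON
+@ and skips the integer-only implementation.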
+
+.globl	_sha512_block_data_order
+.private_extern	_sha512_block_data_order
+#ifdef __thumb2__
+.thumb_func	_sha512_block_data_order
+#endif
+_sha512_block_data_order:
+Lsha512_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+	sub	r3,pc,#8		@ _sha512_block_data_order
+#else
+	adr	r3,Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+#ifdef	__APPLE__
+	ldr	r12,[r12]
+#endif
+	tst	r12,#ARMV7_NEON
+	bne	LNEON
+#endif
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	sub	r14,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	r7,[r0,#32+LO]
+	ldr	r8,[r0,#32+HI]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+Loop:
+	str	r9, [sp,#48+0]
+	str	r10, [sp,#48+4]
+	str	r11, [sp,#56+0]
+	str	r12, [sp,#56+4]
+	ldr	r5,[r0,#0+LO]
+	ldr	r6,[r0,#0+HI]
+	ldr	r3,[r0,#8+LO]
+	ldr	r4,[r0,#8+HI]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	str	r3,[sp,#8+0]
+	str	r4,[sp,#8+4]
+	str	r9, [sp,#16+0]
+	str	r10, [sp,#16+4]
+	str	r11, [sp,#24+0]
+	str	r12, [sp,#24+4]
+	ldr	r3,[r0,#40+LO]
+	ldr	r4,[r0,#40+HI]
+	str	r3,[sp,#40+0]
+	str	r4,[sp,#40+4]
+
+L00_15:
+#if __ARM_ARCH__<7
+	ldrb	r3,[r1,#7]
+	ldrb	r9, [r1,#6]
+	ldrb	r10, [r1,#5]
+	ldrb	r11, [r1,#4]
+	ldrb	r4,[r1,#3]
+	ldrb	r12, [r1,#2]
+	orr	r3,r3,r9,lsl#8
+	ldrb	r9, [r1,#1]
+	orr	r3,r3,r10,lsl#16
+	ldrb	r10, [r1],#8
+	orr	r3,r3,r11,lsl#24
+	orr	r4,r4,r12,lsl#8
+	orr	r4,r4,r9,lsl#16
+	orr	r4,r4,r10,lsl#24
+#else
+	ldr	r3,[r1,#4]
+	ldr	r4,[r1],#8
+#ifdef __ARMEL__
+	rev	r3,r3
+	rev	r4,r4
+#endif
+#endif
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
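+	@ e.g. ROTR64(x,14): lo' = lo>>14 | hi<<18 and hi' = hi>>14 | lo<<18,
+	@ so each rotate by n<32 costs one lsr and one lsl per 32-bit half;
+	@ ROTR64(x,41) = ROTR64(x,32+9), i.e. the halves swap roles first,
+	@ giving the hi>>9^lo<<23 term above.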
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#148
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
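+	@ here ROTR 34 and ROTR 39 exceed 32, so they reduce to a half-swap
+	@ followed by ROTR 2 and ROTR 7 respectively (the hi>>2^lo<<30 and
+	@ hi>>7^lo<<25 terms above).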
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+	tst	r14,#1
+	beq	L00_15
+	ldr	r9,[sp,#184+0]
+	ldr	r10,[sp,#184+4]
+	bic	r14,r14,#1
+L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
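+	@ unlike Sigma0/Sigma1, the last term here is a plain 64-bit shift,
+	@ so its HI half is just hi>>7 with no bits carried in from lo.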
+	mov	r3,r9,lsr#1
+	ldr	r11,[sp,#80+0]
+	mov	r4,r10,lsr#1
+	ldr	r12,[sp,#80+4]
+	eor	r3,r3,r10,lsl#31
+	eor	r4,r4,r9,lsl#31
+	eor	r3,r3,r9,lsr#8
+	eor	r4,r4,r10,lsr#8
+	eor	r3,r3,r10,lsl#24
+	eor	r4,r4,r9,lsl#24
+	eor	r3,r3,r9,lsr#7
+	eor	r4,r4,r10,lsr#7
+	eor	r3,r3,r10,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
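+	@ ROTR 61 reduces to a half-swap plus ROTR 29 (the hi>>29^lo<<3
+	@ term), and (x)>>6 is again a plain shift, so HI ends in hi>>6.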
+	mov	r9,r11,lsr#19
+	mov	r10,r12,lsr#19
+	eor	r9,r9,r12,lsl#13
+	eor	r10,r10,r11,lsl#13
+	eor	r9,r9,r12,lsr#29
+	eor	r10,r10,r11,lsr#29
+	eor	r9,r9,r11,lsl#3
+	eor	r10,r10,r12,lsl#3
+	eor	r9,r9,r11,lsr#6
+	eor	r10,r10,r12,lsr#6
+	ldr	r11,[sp,#120+0]
+	eor	r9,r9,r12,lsl#26
+
+	ldr	r12,[sp,#120+4]
+	adds	r3,r3,r9
+	ldr	r9,[sp,#192+0]
+	adc	r4,r4,r10
+
+	ldr	r10,[sp,#192+4]
+	adds	r3,r3,r11
+	adc	r4,r4,r12
+	adds	r3,r3,r9
+	adc	r4,r4,r10
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#23
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r9,[sp,#184+0]
+	ldreq	r10,[sp,#184+4]
+	beq	L16_79
+	bic	r14,r14,#1
+
+	ldr	r3,[sp,#8+0]
+	ldr	r4,[sp,#8+4]
+	ldr	r9, [r0,#0+LO]
+	ldr	r10, [r0,#0+HI]
+	ldr	r11, [r0,#8+LO]
+	ldr	r12, [r0,#8+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#0+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#0+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#8+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#8+HI]
+
+	ldr	r5,[sp,#16+0]
+	ldr	r6,[sp,#16+4]
+	ldr	r3,[sp,#24+0]
+	ldr	r4,[sp,#24+4]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#16+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#16+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#24+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#24+HI]
+
+	ldr	r3,[sp,#40+0]
+	ldr	r4,[sp,#40+4]
+	ldr	r9, [r0,#32+LO]
+	ldr	r10, [r0,#32+HI]
+	ldr	r11, [r0,#40+LO]
+	ldr	r12, [r0,#40+HI]
+	adds	r7,r7,r9
+	str	r7,[r0,#32+LO]
+	adc	r8,r8,r10
+	str	r8,[r0,#32+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#40+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#40+HI]
+
+	ldr	r5,[sp,#48+0]
+	ldr	r6,[sp,#48+4]
+	ldr	r3,[sp,#56+0]
+	ldr	r4,[sp,#56+4]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#48+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#48+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#56+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#56+HI]
+
+	add	sp,sp,#640
+	sub	r14,r14,#640
+
+	teq	r1,r2
+	bne	Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+
+
+
+.globl	_sha512_block_data_order_neon
+.private_extern	_sha512_block_data_order_neon
+#ifdef __thumb2__
+.thumb_func	_sha512_block_data_order_neon
+#endif
+.align	4
+_sha512_block_data_order_neon:
+LNEON:
+	dmb	@ errata #451034 on early Cortex A8
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	adr	r3,K512
+	VFP_ABI_PUSH
+	vldmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}		@ load context
+Loop_neon:
+	vshr.u64	d24,d20,#14	@ 0
+#if 0<16
+	vld1.64	{d0},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 0>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+	vrev64.8	d0,d0
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 1
+#if 1<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 1>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+	vrev64.8	d1,d1
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 2
+#if 2<16
+	vld1.64	{d2},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 2>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+	vrev64.8	d2,d2
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 3
+#if 3<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 3>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+	vrev64.8	d3,d3
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 4
+#if 4<16
+	vld1.64	{d4},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 4>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+	vrev64.8	d4,d4
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 5
+#if 5<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 5>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+	vrev64.8	d5,d5
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 6
+#if 6<16
+	vld1.64	{d6},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 6>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+	vrev64.8	d6,d6
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 7
+#if 7<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 7>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+	vrev64.8	d7,d7
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	d24,d20,#14	@ 8
+#if 8<16
+	vld1.64	{d8},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 8>0
+	vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+	vrev64.8	d8,d8
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 9
+#if 9<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 9>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+	vrev64.8	d9,d9
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 10
+#if 10<16
+	vld1.64	{d10},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 10>0
+	vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+	vrev64.8	d10,d10
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 11
+#if 11<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 11>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+	vrev64.8	d11,d11
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 12
+#if 12<16
+	vld1.64	{d12},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 12>0
+	vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+	vrev64.8	d12,d12
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 13
+#if 13<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 13>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+	vrev64.8	d13,d13
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 14
+#if 14<16
+	vld1.64	{d14},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 14>0
+	vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+	vrev64.8	d14,d14
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 15
+#if 15<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 15>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+	vrev64.8	d15,d15
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	mov	r12,#4
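+	@ four passes of sixteen rounds each cover rounds 16..79; the
+	@ message schedule lives in q0-q7 and two schedule entries are
+	@ renewed per step using the q12/q13/q15 temporaries.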
+L16_79_neon:
+	subs	r12,#1
+	vshr.u64	q12,q7,#19
+	vshr.u64	q13,q7,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q7,#6
+	vsli.64	q12,q7,#45
+	vext.8	q14,q0,q1,#8	@ X[i+1]
+	vsli.64	q13,q7,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q0,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q4,q5,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q0,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q0,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
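+	@ (the operand-less vrev64.8 above is dead code: its 16<16 guard
+	@ is never true, so the preprocessor removes it before assembly)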
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 17
+#if 17<16
+	vld1.64	{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 17>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q0,#19
+	vshr.u64	q13,q0,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q0,#6
+	vsli.64	q12,q0,#45
+	vext.8	q14,q1,q2,#8	@ X[i+1]
+	vsli.64	q13,q0,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q1,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q5,q6,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q1,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q1,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 19
+#if 19<16
+	vld1.64	{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 19>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q1,#19
+	vshr.u64	q13,q1,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q1,#6
+	vsli.64	q12,q1,#45
+	vext.8	q14,q2,q3,#8	@ X[i+1]
+	vsli.64	q13,q1,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q2,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q6,q7,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q2,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q2,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 21
+#if 21<16
+	vld1.64	{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 21>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q2,#19
+	vshr.u64	q13,q2,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q2,#6
+	vsli.64	q12,q2,#45
+	vext.8	q14,q3,q4,#8	@ X[i+1]
+	vsli.64	q13,q2,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q3,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q7,q0,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q3,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q3,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 23
+#if 23<16
+	vld1.64	{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 23>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	q12,q3,#19
+	vshr.u64	q13,q3,#61
+	vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q3,#6
+	vsli.64	q12,q3,#45
+	vext.8	q14,q4,q5,#8	@ X[i+1]
+	vsli.64	q13,q3,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q4,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q0,q1,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q4,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q4,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d20,#50
+	vsli.64	d25,d20,#46
+	vmov	d29,d20
+	vsli.64	d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64	d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64	d25,d16,#30
+	veor	d30,d16,d17
+	vsli.64	d26,d16,#25
+	veor	d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d18,d17		@ Maj(a,b,c)
+	veor	d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 25
+#if 25<16
+	vld1.64	{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 25>0
+	vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d19,#50
+	vsli.64	d25,d19,#46
+	vmov	d29,d19
+	vsli.64	d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64	d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64	d25,d23,#30
+	veor	d30,d23,d16
+	vsli.64	d26,d23,#25
+	veor	d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d17,d16		@ Maj(a,b,c)
+	veor	d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q4,#19
+	vshr.u64	q13,q4,#61
+	vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q4,#6
+	vsli.64	q12,q4,#45
+	vext.8	q14,q5,q6,#8	@ X[i+1]
+	vsli.64	q13,q4,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q5,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q1,q2,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q5,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q5,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d18,#50
+	vsli.64	d25,d18,#46
+	vmov	d29,d18
+	vsli.64	d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64	d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64	d25,d22,#30
+	veor	d30,d22,d23
+	vsli.64	d26,d22,#25
+	veor	d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d16,d23		@ Maj(a,b,c)
+	veor	d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 27
+#if 27<16
+	vld1.64	{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 27>0
+	vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d17,#50
+	vsli.64	d25,d17,#46
+	vmov	d29,d17
+	vsli.64	d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64	d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64	d25,d21,#30
+	veor	d30,d21,d22
+	vsli.64	d26,d21,#25
+	veor	d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d23,d22		@ Maj(a,b,c)
+	veor	d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q5,#19
+	vshr.u64	q13,q5,#61
+	vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q5,#6
+	vsli.64	q12,q5,#45
+	vext.8	q14,q6,q7,#8	@ X[i+1]
+	vsli.64	q13,q5,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q6,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q2,q3,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q6,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q6,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d16,#50
+	vsli.64	d25,d16,#46
+	vmov	d29,d16
+	vsli.64	d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64	d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64	d25,d20,#30
+	veor	d30,d20,d21
+	vsli.64	d26,d20,#25
+	veor	d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d22,d21		@ Maj(a,b,c)
+	veor	d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 29
+#if 29<16
+	vld1.64	{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 29>0
+	vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d23,#50
+	vsli.64	d25,d23,#46
+	vmov	d29,d23
+	vsli.64	d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64	d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64	d25,d19,#30
+	veor	d30,d19,d20
+	vsli.64	d26,d19,#25
+	veor	d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d21,d20		@ Maj(a,b,c)
+	veor	d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q6,#19
+	vshr.u64	q13,q6,#61
+	vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q6,#6
+	vsli.64	q12,q6,#45
+	vext.8	q14,q7,q0,#8	@ X[i+1]
+	vsli.64	q13,q6,#3
+	veor	q15,q12
+	vshr.u64	q12,q14,#1
+	veor	q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q7,q15
+	vshr.u64	q15,q14,#7
+	vsli.64	q12,q14,#63
+	vsli.64	q13,q14,#56
+	vext.8	q14,q3,q4,#8	@ X[i+9]
+	veor	q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q7,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor	q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q7,q15
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d22,#50
+	vsli.64	d25,d22,#46
+	vmov	d29,d22
+	vsli.64	d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64	d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64	d25,d18,#30
+	veor	d30,d18,d19
+	vsli.64	d26,d18,#25
+	veor	d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d20,d19		@ Maj(a,b,c)
+	veor	d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 31
+#if 31<16
+	vld1.64	{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 31>0
+	vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64	{d28},[r3,:64]!	@ K[i++]
+	vsli.64	d24,d21,#50
+	vsli.64	d25,d21,#46
+	vmov	d29,d21
+	vsli.64	d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor	d25,d24
+	vbsl	d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor	d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64	d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64	d25,d17,#30
+	veor	d30,d17,d18
+	vsli.64	d26,d17,#25
+	veor	d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl	d30,d19,d18		@ Maj(a,b,c)
+	veor	d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	bne	L16_79_neon
+
+	vadd.i64	d16,d30		@ h+=Maj from the past
+	vldmia	r0,{d24,d25,d26,d27,d28,d29,d30,d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}	@ save context
+	teq	r1,r2
+	sub	r3,#640	@ rewind K512
+	bne	Loop_neon
+
+	VFP_ABI_POP
+	bx	lr				@ .word	0xe12fff1e
+
+#endif
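+
+@ For reference, the round functions computed by the NEON code above, as
+@ a scalar Python model (an editorial sketch of the FIPS 180-4 SHA-512
+@ functions, matching the rotate counts in the vshr/vsli pairs above; not
+@ part of the generated output):
+@
+@   MASK64 = (1 << 64) - 1
+@   def rotr(x, n): return ((x >> n) | (x << (64 - n))) & MASK64
+@   def Sigma0(a): return rotr(a, 28) ^ rotr(a, 34) ^ rotr(a, 39)
+@   def Sigma1(e): return rotr(e, 14) ^ rotr(e, 18) ^ rotr(e, 41)
+@   def sigma0(x): return rotr(x, 1) ^ rotr(x, 8) ^ (x >> 7)
+@   def sigma1(x): return rotr(x, 19) ^ rotr(x, 61) ^ (x >> 6)
+@   def Ch(e, f, g): return ((e & f) ^ (~e & g)) & MASK64
+@   def Maj(a, b, c): return (a & b) ^ (a & c) ^ (b & c)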
+.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	_OPENSSL_armcap_P,4
+.non_lazy_symbol_pointer
+OPENSSL_armcap_P:
+.indirect_symbol	_OPENSSL_armcap_P
+.long	0
+.private_extern	_OPENSSL_armcap_P
+#endif
+#endif  // !OPENSSL_NO_ASM
diff --git a/apple-arm/crypto/fipsmodule/vpaes-armv7.S b/apple-arm/crypto/fipsmodule/vpaes-armv7.S
new file mode 100644
index 0000000..6aead7c
--- /dev/null
+++ b/apple-arm/crypto/fipsmodule/vpaes-armv7.S
@@ -0,0 +1,1265 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax	unified
+
+
+
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code	32
+#endif
+
+.text
+
+
+.align	7	@ totally strategic alignment
+_vpaes_consts:
+Lk_mc_forward:@ mc_forward
+.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+Lk_mc_backward:@ mc_backward
+.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+Lk_sr:@ sr
+.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+.quad	0x0F060D040B020900, 0x070E050C030A0108
+.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+Lk_inv:@ inv, inva
+.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+Lk_ipt:@ input transform (lo, hi)
+.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+Lk_sbo:@ sbou, sbot
+.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+Lk_sb1:@ sb1u, sb1t
+.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+Lk_sb2:@ sb2u, sb2t
+.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align	2
+
+.align	6
+@@
+@@  _aes_preheat
+@@
+@@  Fills q9-q15 as specified below.
+@@
+#ifdef __thumb2__
+.thumb_func	_vpaes_preheat
+#endif
+.align	4
+_vpaes_preheat:
+	adr	r10, Lk_inv
+	vmov.i8	q9, #0x0f		@ Lk_s0F
+	vld1.64	{q10,q11}, [r10]!	@ Lk_inv
+	add	r10, r10, #64		@ Skip Lk_ipt, Lk_sbo
+	vld1.64	{q12,q13}, [r10]!	@ Lk_sb1
+	vld1.64	{q14,q15}, [r10]	@ Lk_sb2
+	bx	lr
+
+@@
+@@  _aes_encrypt_core
+@@
+@@  AES-encrypt q0.
+@@
+@@  Inputs:
+@@     q0 = input
+@@     q9-q15 as in _vpaes_preheat
+@@    [r2] = scheduled keys
+@@
+@@  Output in q0
+@@  Clobbers  q1-q5, r8-r11
+@@  Preserves q6-q8 so you get some local vectors
+@@
+@@
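+@
+@  Every vtbl-based substitution below follows one pattern: split each
+@  byte into nibbles and XOR two 16-entry table lookups. A byte-level
+@  model (editorial sketch in Python; lo_tbl/hi_tbl stand for a constant
+@  pair such as Lk_ipt):
+@
+@    def shuffle_pair(lo_tbl, hi_tbl, b):
+@        return lo_tbl[b & 0x0f] ^ hi_tbl[b >> 4]
+@
+@  The vand with q9 (0x0f) extracts the low nibbles, vshr.u8 #4 the high
+@  ones, and vtbl.8 performs the lookups.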
+#ifdef __thumb2__
+.thumb_func	_vpaes_encrypt_core
+#endif
+.align	4
+_vpaes_encrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+	adr	r11, Lk_ipt
+	@ vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
+	@ vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
+	vld1.64	{q2, q3}, [r11]
+	adr	r11, Lk_mc_forward+16
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5		# round0 key
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d2, {q2}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm1
+	vtbl.8	d3, {q2}, d3
+	vtbl.8	d4, {q3}, d0	@ vpshufb	%xmm0,	%xmm3,	%xmm2
+	vtbl.8	d5, {q3}, d1
+	veor	q0, q1, q5		@ vpxor	%xmm5,	%xmm1,	%xmm0
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+
+	@ .Lenc_entry ends with a bnz instruction which is normally paired with
+	@ subs in .Lenc_loop.
+	tst	r8, r8
+	b	Lenc_entry
+
+.align	4
+Lenc_loop:
+	@ middle of middle round
+	add	r10, r11, #0x40
+	vtbl.8	d8, {q13}, d4	@ vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
+	vtbl.8	d9, {q13}, d5
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
+	vtbl.8	d0, {q12}, d6	@ vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
+	vtbl.8	d1, {q12}, d7
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	vtbl.8	d10, {q15}, d4	@ vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
+	vtbl.8	d11, {q15}, d5
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0	# 0 = A
+	vtbl.8	d4, {q14}, d6	@ vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
+	vtbl.8	d5, {q14}, d7
+	vld1.64	{q4}, [r10]		@ vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
+	vtbl.8	d6, {q0}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
+	vtbl.8	d7, {q0}, d3
+	veor	q2, q2, q5		@ vpxor		%xmm5,	%xmm2,	%xmm2	# 2 = 2A
+	@ Write to q5 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d10, {q0}, d8	@ vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
+	vtbl.8	d11, {q0}, d9
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
+	vtbl.8	d8, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
+	vtbl.8	d9, {q3}, d3
+	@ Here we restore the original q0/q5 usage.
+	veor	q0, q5, q3		@ vpxor		%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
+	and	r11, r11, #~(1<<6)	@ and		$0x30,	%r11		# ... mod 4
+	veor	q0, q0, q4		@ vpxor		%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
+	subs	r8, r8, #1		@ nr--
+
+Lenc_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm0,	%xmm9,	%xmm1   # 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d10, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
+	vtbl.8	d11, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q5		@ vpxor		%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q5		@ vpxor		%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2  	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,	%xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q5}, [r9]!		@ vmovdqu	(%r9),	%xmm5
+	bne	Lenc_loop
+
+	@ middle of last round
+	add	r10, r11, #0x80
+
+	adr	r11, Lk_sbo
+	@ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+	@ overlap table and destination registers.
+	vld1.64	{q1}, [r11]!		@ vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou
+	vld1.64	{q0}, [r11]		@ vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	Lk_sbo+16
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	vld1.64	{q1}, [r10]		@ vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
+	@ Write to q2 instead of q0 below, to avoid overlapping table and
+	@ destination registers.
+	vtbl.8	d4, {q0}, d6	@ vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
+	vtbl.8	d5, {q0}, d7
+	veor	q4, q4, q5		@ vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
+	veor	q2, q2, q4		@ vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
+	@ Here we restore the original q0/q2 usage.
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0
+	vtbl.8	d1, {q2}, d3
+	bx	lr
+
+
+.globl	_vpaes_encrypt
+.private_extern	_vpaes_encrypt
+#ifdef __thumb2__
+.thumb_func	_vpaes_encrypt
+#endif
+.align	4
+_vpaes_encrypt:
+	@ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+	@ alignment.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_encrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+
+
+@
+@  Decryption stuff
+@
+
+.align	4
+_vpaes_decrypt_consts:
+Lk_dipt:@ decryption input transform
+.quad	0x0F505B040B545F00, 0x154A411E114E451A
+.quad	0x86E383E660056500, 0x12771772F491F194
+Lk_dsbo:@ decryption sbox final output
+.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad	0xD022649296B44200, 0x602646F6B0F2D404
+.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+
+@@
+@@  Decryption core
+@@
+@@  Same API as encryption core, except it clobbers q12-q15 rather than using
+@@  the values from _vpaes_preheat. q9-q11 must still be set from
+@@  _vpaes_preheat.
+@@
+#ifdef __thumb2__
+.thumb_func	_vpaes_decrypt_core
+#endif
+.align	4
+_vpaes_decrypt_core:
+	mov	r9, r2
+	ldr	r8, [r2,#240]		@ pull rounds
+
+	@ This function performs shuffles with various constants. The x86_64
+	@ version loads them on-demand into %xmm0-%xmm5. This does not work well
+	@ for ARMv7 because those registers are shuffle destinations. The ARMv8
+	@ version preloads those constants into registers, but ARMv7 has half
+	@ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+	@ because decryption doesn't use those constants. The values are
+	@ constant, so this does not interfere with potential 2x optimizations.
+	adr	r7, Lk_dipt
+
+	vld1.64	{q12,q13}, [r7]		@ vmovdqa	Lk_dipt(%rip), %xmm2	# iptlo
+	lsl	r11, r8, #4		@ mov		%rax,	%r11;	shl	$4, %r11
+	eor	r11, r11, #0x30		@ xor		$0x30,	%r11
+	adr	r10, Lk_sr
+	and	r11, r11, #0x30		@ and		$0x30,	%r11
+	add	r11, r11, r10
+	adr	r10, Lk_mc_forward+48
+
+	vld1.64	{q4}, [r9]!		@ vmovdqu	(%r9),	%xmm4		# round0 key
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q12}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q12}, d3
+	vld1.64	{q5}, [r10]		@ vmovdqa	Lk_mc_forward+48(%rip), %xmm5
+					@ vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
+	vtbl.8	d0, {q13}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q13}, d1
+	veor	q2, q2, q4		@ vpxor		%xmm4,	%xmm2,	%xmm2
+	veor	q0, q0, q2		@ vpxor		%xmm2,	%xmm0,	%xmm0
+
+	@ .Ldec_entry ends with a bnz instruction which is normally paired with
+	@ subs in .Ldec_loop.
+	tst	r8, r8
+	b	Ldec_entry
+
+.align	4
+Ldec_loop:
+@
+@  Inverse mix columns
+@
+
+	@ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+	@ the function.
+	adr	r10, Lk_dsb9
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
+					@ vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
+	@ Load sbd* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
+					@ vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
+	vtbl.8	d9, {q12}, d5
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q4, q0		@ vpxor		%xmm4,	%xmm0,	%xmm0
+
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	@ Load sbb* ahead of time.
+	vld1.64	{q12,q13}, [r10]!	@ vmovdqa	0x20(%r10),%xmm4		# 4 : sbbu
+					@ vmovdqa	0x30(%r10),%xmm1		# 0 : sbbt
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
+	vtbl.8	d3, {q15}, d7
+					@ vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+					@ vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
+
+	@ Load sbe* ahead of time.
+	vld1.64	{q14,q15}, [r10]!	@ vmovdqa	0x40(%r10),%xmm4		# 4 : sbeu
+					@ vmovdqa	0x50(%r10),%xmm1		# 0 : sbet
+
+	vtbl.8	d8, {q12}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
+	vtbl.8	d9, {q12}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q13}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
+	vtbl.8	d3, {q13}, d7
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+
+	vtbl.8	d8, {q14}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
+	vtbl.8	d9, {q14}, d5
+	@ Write to q1 instead of q0, so the table and destination registers do
+	@ not overlap.
+	vtbl.8	d2, {q0}, d10	@ vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
+	vtbl.8	d3, {q0}, d11
+	@ Here we restore the original q0/q1 usage. This instruction is
+	@ reordered from the ARMv8 version so we do not clobber the vtbl.8
+	@ below.
+	veor	q0, q1, q4		@ vpxor		%xmm4,	%xmm0,	%xmm0		# 4 = ch
+	vtbl.8	d2, {q15}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
+	vtbl.8	d3, {q15}, d7
+	vext.8	q5, q5, q5, #12		@ vpalignr 	$12,	%xmm5,	%xmm5,	%xmm5
+	veor	q0, q0, q1		@ vpxor		%xmm1,	%xmm0,	%xmm0		# 0 = ch
+	subs	r8, r8, #1		@ sub		$1,%rax			# nr--
+
+Ldec_entry:
+	@ top of round
+	vand	q1, q0, q9		@ vpand		%xmm9,	%xmm0,	%xmm1	# 0 = k
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
+	vtbl.8	d4, {q11}, d2	@ vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0		@ vpxor		%xmm0,	%xmm1,	%xmm1	# 0 = j
+	vtbl.8	d6, {q10}, d0	@ vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	vtbl.8	d8, {q10}, d2	@ vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q3, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
+	veor	q4, q4, q2		@ vpxor		%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d6	@ vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
+	vtbl.8	d5, {q10}, d7
+	vtbl.8	d6, {q10}, d8	@ vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
+	vtbl.8	d7, {q10}, d9
+	veor	q2, q2, q1		@ vpxor		%xmm1,	%xmm2,	%xmm2	# 2 = io
+	veor	q3, q3, q0		@ vpxor		%xmm0,  %xmm3,	%xmm3	# 3 = jo
+	vld1.64	{q0}, [r9]!		@ vmovdqu	(%r9),	%xmm0
+	bne	Ldec_loop
+
+	@ middle of last round
+
+	adr	r10, Lk_dsbo
+
+	@ Write to q1 rather than q4 to avoid overlapping table and destination.
+	vld1.64	{q1}, [r10]!		@ vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
+	vtbl.8	d8, {q1}, d4	@ vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
+	vtbl.8	d9, {q1}, d5
+	@ Write to q2 rather than q1 to avoid overlapping table and destination.
+	vld1.64	{q2}, [r10]		@ vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
+	vtbl.8	d2, {q2}, d6	@ vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
+	vtbl.8	d3, {q2}, d7
+	vld1.64	{q2}, [r11]		@ vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
+	veor	q4, q4, q0		@ vpxor		%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
+	@ Write to q1 rather than q0 so the table and destination registers
+	@ below do not overlap.
+	veor	q1, q1, q4		@ vpxor		%xmm4,	%xmm1,	%xmm0	# 0 = A
+	vtbl.8	d0, {q1}, d4	@ vpshufb	%xmm2,	%xmm0,	%xmm0
+	vtbl.8	d1, {q1}, d5
+	bx	lr
+
+
+.globl	_vpaes_decrypt
+.private_extern	_vpaes_decrypt
+#ifdef __thumb2__
+.thumb_func	_vpaes_decrypt
+#endif
+.align	4
+_vpaes_decrypt:
+	@ _vpaes_decrypt_core uses r7-r11.
+	stmdb	sp!, {r7,r8,r9,r10,r11,lr}
+	@ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11}
+
+	vld1.64	{q0}, [r0]
+	bl	_vpaes_preheat
+	bl	_vpaes_decrypt_core
+	vst1.64	{q0}, [r1]
+
+	vldmia	sp!, {d8,d9,d10,d11}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@                                                    @@
+@@                  AES key schedule                  @@
+@@                                                    @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@  Key schedule constants
+@
+
+.align	4
+_vpaes_key_consts:
+Lk_dksd:@ decryption key schedule: invskew x*D
+.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+Lk_dksb:@ decryption key schedule: invskew x*B
+.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+Lk_dks9:@ decryption key schedule: invskew x*9
+.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+Lk_rcon:@ rcon
+.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+Lk_opt:@ output transform
+.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+#ifdef __thumb2__
+.thumb_func	_vpaes_key_preheat
+#endif
+.align	4
+_vpaes_key_preheat:
+	adr	r11, Lk_rcon
+	vmov.i8	q12, #0x5b			@ Lk_s63
+	adr	r10, Lk_inv			@ Must be aligned to 8 mod 16.
+	vmov.i8	q9, #0x0f			@ Lk_s0F
+	vld1.64	{q10,q11}, [r10]		@ Lk_inv
+	vld1.64	{q8}, [r11]			@ Lk_rcon
+	bx	lr
+
+
+#ifdef __thumb2__
+.thumb_func	_vpaes_schedule_core
+#endif
+.align	4
+_vpaes_schedule_core:
+	@ We only need to save lr, but ARM requires an 8-byte stack alignment,
+	@ so save an extra register.
+	stmdb	sp!, {r3,lr}
+
+	bl	_vpaes_key_preheat	@ load the tables
+
+	adr	r11, Lk_ipt		@ Must be aligned to 8 mod 16.
+	vld1.64	{q0}, [r0]!		@ vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	@ input transform
+	@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+	@ overlap table and destination.
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm3
+	bl	_vpaes_schedule_transform
+	adr	r10, Lk_sr		@ Must be aligned to 8 mod 16.
+	vmov	q7, q0			@ vmovdqa	%xmm0,	%xmm7
+
+	add	r8, r8, r10
+	tst	r3, r3
+	bne	Lschedule_am_decrypting
+
+	@ encrypting, output zeroth round key after transform
+	vst1.64	{q0}, [r2]		@ vmovdqu	%xmm0,	(%rdx)
+	b	Lschedule_go
+
+Lschedule_am_decrypting:
+	@ decrypting, output zeroth round key after shiftrows
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	vtbl.8	d6, {q4}, d2	@ vpshufb  	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q4}, d3
+	vst1.64	{q3}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	eor	r8, r8, #0x30		@ xor	$0x30, %r8
+
+Lschedule_go:
+	cmp	r1, #192		@ cmp	$192,	%esi
+	bhi	Lschedule_256
+	beq	Lschedule_192
+	@ 128: fall through
+
+@@
+@@  .schedule_128
+@@
+@@  128-bit specific part of key schedule.
+@@
+@@  This schedule is really simple, because all its parts
+@@  are accomplished by the subroutines.
+@@
+Lschedule_128:
+	mov	r0, #10		@ mov	$10, %esi
+
+Loop_schedule_128:
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1		@ dec	%esi
+	beq	Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle	@ write output
+	b	Loop_schedule_128
+
+@@
+@@  .aes_schedule_192
+@@
+@@  192-bit specific part of key schedule.
+@@
+@@  The main body of this schedule is the same as the 128-bit
+@@  schedule, but with more smearing.  The long, high side is
+@@  stored in q7 as before, and the short, low side is in
+@@  the high bits of q6.
+@@
+@@  This schedule is somewhat nastier, however, because each
+@@  round produces 192 bits of key material, or 1.5 round keys.
+@@  Therefore, on each cycle we do 2 rounds and produce 3 round
+@@  keys.
+@@
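+@
+@  Key-count bookkeeping for the loop below, as an editorial Python
+@  sketch (not generated output); the round-0 key is stored before
+@  Lschedule_go and the final cycle ends in Lschedule_mangle_last:
+@
+@    keys = 1                   # round-0 key
+@    for cycle in range(4):     # mov r0, #4
+@        keys += 2              # "save key n" and "save key n+1"
+@        keys += 1              # "save key n+2", or the final mangle
+@    assert keys == 13          # AES-192 uses 13 round keys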
+.align	4
+Lschedule_192:
+	sub	r0, r0, #8
+	vld1.64	{q0}, [r0]			@ vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save short part
+	vmov.i8	d12, #0			@ vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
+						@ vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
+	mov	r0, #4			@ mov	$4,	%esi
+
+Loop_schedule_192:
+	bl	_vpaes_schedule_round
+	vext.8	q0, q6, q0, #8			@ vpalignr	$8,%xmm6,%xmm0,%xmm0
+	bl	_vpaes_schedule_mangle		@ save key n
+	bl	_vpaes_schedule_192_smear
+	bl	_vpaes_schedule_mangle		@ save key n+1
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle		@ save key n+2
+	bl	_vpaes_schedule_192_smear
+	b	Loop_schedule_192
+
+@@
+@@  .aes_schedule_256
+@@
+@@  256-bit specific part of key schedule.
+@@
+@@  The structure here is very similar to the 128-bit
+@@  schedule, but with an additional "low side" in
+@@  q6.  The low side's rounds are the same as the
+@@  high side's, except no rcon and no rotation.
+@@
+.align	4
+Lschedule_256:
+	vld1.64	{q0}, [r0]			@ vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	bl	_vpaes_schedule_transform	@ input transform
+	mov	r0, #7			@ mov	$7, %esi
+
+Loop_schedule_256:
+	bl	_vpaes_schedule_mangle		@ output low result
+	vmov	q6, q0				@ vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	@ high round
+	bl	_vpaes_schedule_round
+	subs	r0, r0, #1			@ dec	%esi
+	beq	Lschedule_mangle_last
+	bl	_vpaes_schedule_mangle
+
+	@ low round. swap xmm7 and xmm6
+	vdup.32	q0, d1[1]		@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vmov.i8	q4, #0
+	vmov	q5, q7			@ vmovdqa	%xmm7,	%xmm5
+	vmov	q7, q6			@ vmovdqa	%xmm6,	%xmm7
+	bl	_vpaes_schedule_low_round
+	vmov	q7, q5			@ vmovdqa	%xmm5,	%xmm7
+
+	b	Loop_schedule_256
+
+@@
+@@  .aes_schedule_mangle_last
+@@
+@@  Mangler for last round of key schedule
+@@  Mangles q0
+@@    when encrypting, outputs out(q0) ^ 63
+@@    when decrypting, outputs unskew(q0)
+@@
+@@  Always called right before return... jumps to cleanup and exits
+@@
+.align	4
+Lschedule_mangle_last:
+	@ schedule last round key from xmm0
+	adr	r11, Lk_deskew			@ lea	Lk_deskew(%rip),%r11	# prepare to deskew
+	tst	r3, r3
+	bne	Lschedule_mangle_last_dec
+
+	@ encrypting
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),%xmm1
+	adr	r11, Lk_opt		@ lea		Lk_opt(%rip),	%r11		# prepare to output transform
+	add	r2, r2, #32		@ add		$32,	%rdx
+	vmov	q2, q0
+	vtbl.8	d0, {q2}, d2	@ vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
+	vtbl.8	d1, {q2}, d3
+
+Lschedule_mangle_last_dec:
+	sub	r2, r2, #16			@ add	$-16,	%rdx
+	veor	q0, q0, q12			@ vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
+	bl	_vpaes_schedule_transform	@ output transform
+	vst1.64	{q0}, [r2]			@ vmovdqu	%xmm0,	(%rdx)		# save last key
+
+	@ cleanup
+	veor	q0, q0, q0		@ vpxor	%xmm0,	%xmm0,	%xmm0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q2, q2, q2		@ vpxor	%xmm2,	%xmm2,	%xmm2
+	veor	q3, q3, q3		@ vpxor	%xmm3,	%xmm3,	%xmm3
+	veor	q4, q4, q4		@ vpxor	%xmm4,	%xmm4,	%xmm4
+	veor	q5, q5, q5		@ vpxor	%xmm5,	%xmm5,	%xmm5
+	veor	q6, q6, q6		@ vpxor	%xmm6,	%xmm6,	%xmm6
+	veor	q7, q7, q7		@ vpxor	%xmm7,	%xmm7,	%xmm7
+	ldmia	sp!, {r3,pc}		@ return
+
+
+@@
+@@  .aes_schedule_192_smear
+@@
+@@  Smear the short, low side in the 192-bit key schedule.
+@@
+@@  Inputs:
+@@    q7: high side, b  a  x  y
+@@    q6:  low side, d  c  0  0
+@@
+@@  Outputs:
+@@    q6: b+c+d  b+c  0  0
+@@    q0: b+c+d  b+c  b  a
+@@
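+@
+@  A lane-level model of the smear, as an editorial Python sketch (each
+@  vector is a list of four 32-bit lanes, highest lane first; "+" in the
+@  comments above is XOR):
+@
+@    def smear(q7, q6):
+@        b, a = q7[0], q7[1]                 # q7 = [b, a, x, y]
+@        d, c = q6[0], q6[1]                 # q6 = [d, c, 0, 0]
+@        q6 = [d ^ c, c, 0, 0]               # vshl.i64/veor: c+d c 0 0
+@        q6 = [q6[0] ^ b, q6[1] ^ b, b, a]   # XOR in [b, b, b, a]
+@        q0 = list(q6)                       # b+c+d b+c b a
+@        q6 = [q6[0], q6[1], 0, 0]           # clobber low side with zeros
+@        return q6, q0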
+#ifdef __thumb2__
+.thumb_func	_vpaes_schedule_192_smear
+#endif
+.align	4
+_vpaes_schedule_192_smear:
+	vmov.i8	q1, #0
+	vdup.32	q0, d15[1]
+	vshl.i64	q1, q6, #32		@ vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
+	vmov	d0, d15		@ vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	veor	q6, q6, q1		@ vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
+	veor	q1, q1, q1		@ vpxor	%xmm1,	%xmm1,	%xmm1
+	veor	q6, q6, q0		@ vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
+	vmov	q0, q6			@ vmovdqa	%xmm6,	%xmm0
+	vmov	d12, d2		@ vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
+	bx	lr
+
+
+@@
+@@  .aes_schedule_round
+@@
+@@  Runs one main round of the key schedule on q0, q7
+@@
+@@  Specifically, runs subbytes on the high dword of q0
+@@  then rotates it by one byte and xors into the low dword of
+@@  q7.
+@@
+@@  Adds rcon from low byte of q8, then rotates q8 for
+@@  next rcon.
+@@
+@@  Smears the dwords of q7 by XORing the low dword into the
+@@  second-lowest, that result into the third, and that result
+@@  into the highest.
+@@
+@@  Returns results in q7 = q0.
+@@  Clobbers q1-q4, r11.
+@@
+#ifdef __thumb2__
+.thumb_func	_vpaes_schedule_round
+#endif
+.align	4
+_vpaes_schedule_round:
+	@ extract rcon from xmm8
+	vmov.i8	q4, #0				@ vpxor		%xmm4,	%xmm4,	%xmm4
+	vext.8	q1, q8, q4, #15		@ vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
+	vext.8	q8, q8, q8, #15	@ vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
+	veor	q7, q7, q1			@ vpxor		%xmm1,	%xmm7,	%xmm7
+
+	@ rotate
+	vdup.32	q0, d1[1]			@ vpshufd	$0xFF,	%xmm0,	%xmm0
+	vext.8	q0, q0, q0, #1			@ vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
+
+	@ fall through...
+
+	@ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	@ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+	@ We pin other values in _vpaes_key_preheat, so load them now.
+	adr	r11, Lk_sb1
+	vld1.64	{q14,q15}, [r11]
+
+	@ smear xmm7
+	vext.8	q1, q4, q7, #12			@ vpslldq	$4,	%xmm7,	%xmm1
+	veor	q7, q7, q1			@ vpxor	%xmm1,	%xmm7,	%xmm7
+	vext.8	q4, q4, q7, #8			@ vpslldq	$8,	%xmm7,	%xmm4
+
+	@ subbytes
+	vand	q1, q0, q9			@ vpand		%xmm9,	%xmm0,	%xmm1		# 0 = k
+	vshr.u8	q0, q0, #4			@ vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
+	veor	q7, q7, q4			@ vpxor		%xmm4,	%xmm7,	%xmm7
+	vtbl.8	d4, {q11}, d2		@ vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
+	vtbl.8	d5, {q11}, d3
+	veor	q1, q1, q0			@ vpxor		%xmm0,	%xmm1,	%xmm1		# 0 = j
+	vtbl.8	d6, {q10}, d0		@ vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
+	vtbl.8	d7, {q10}, d1
+	veor	q3, q3, q2			@ vpxor		%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
+	vtbl.8	d8, {q10}, d2		@ vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
+	vtbl.8	d9, {q10}, d3
+	veor	q7, q7, q12			@ vpxor		Lk_s63(%rip),	%xmm7,	%xmm7
+	vtbl.8	d6, {q10}, d6		@ vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
+	vtbl.8	d7, {q10}, d7
+	veor	q4, q4, q2			@ vpxor		%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
+	vtbl.8	d4, {q10}, d8		@ vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
+	vtbl.8	d5, {q10}, d9
+	veor	q3, q3, q1			@ vpxor		%xmm1,	%xmm3,	%xmm3		# 2 = io
+	veor	q2, q2, q0			@ vpxor		%xmm0,	%xmm2,	%xmm2		# 3 = jo
+	vtbl.8	d8, {q15}, d6		@ vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
+	vtbl.8	d9, {q15}, d7
+	vtbl.8	d2, {q14}, d4		@ vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
+	vtbl.8	d3, {q14}, d5
+	veor	q1, q1, q4			@ vpxor		%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
+
+	@ add in smeared stuff
+	veor	q0, q1, q7			@ vpxor	%xmm7,	%xmm1,	%xmm0
+	veor	q7, q1, q7			@ vmovdqa	%xmm0,	%xmm7
+	bx	lr
+
+
+@@
+@@  .aes_schedule_transform
+@@
+@@  Linear-transform q0 according to tables at [r11]
+@@
+@@  Requires that q9 = 0x0F0F... as in preheat
+@@  Output in q0
+@@  Clobbers q1, q2, q14, q15
+@@
+#ifdef __thumb2__
+.thumb_func	_vpaes_schedule_transform
+#endif
+.align	4
+_vpaes_schedule_transform:
+	vld1.64	{q14,q15}, [r11]	@ vmovdqa	(%r11),	%xmm2 	# lo
+					@ vmovdqa	16(%r11),	%xmm1 # hi
+	vand	q1, q0, q9		@ vpand	%xmm9,	%xmm0,	%xmm1
+	vshr.u8	q0, q0, #4		@ vpsrlb	$4,	%xmm0,	%xmm0
+	vtbl.8	d4, {q14}, d2	@ vpshufb	%xmm1,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d3
+	vtbl.8	d0, {q15}, d0	@ vpshufb	%xmm0,	%xmm1,	%xmm0
+	vtbl.8	d1, {q15}, d1
+	veor	q0, q0, q2		@ vpxor	%xmm2,	%xmm0,	%xmm0
+	bx	lr
+
+
+@@
+@@  .aes_schedule_mangle
+@@
+@@  Mangles q0 from (basis-transformed) standard version
+@@  to our version.
+@@
+@@  On encrypt,
+@@    xor with 0x63
+@@    multiply by circulant 0,1,1,1
+@@    apply shiftrows transform
+@@
+@@  On decrypt,
+@@    xor with 0x63
+@@    multiply by "inverse mixcolumns" circulant E,B,D,9
+@@    deskew
+@@    apply shiftrows transform
+@@
+@@
+@@  Writes out to [r2], and increments or decrements it
+@@  Keeps track of round number mod 4 in r8
+@@  Preserves q0
+@@  Clobbers q1-q5
+@@
+#ifdef __thumb2__
+.thumb_func	_vpaes_schedule_mangle
+#endif
+.align	4
+_vpaes_schedule_mangle:
+	tst	r3, r3
+	vmov	q4, q0			@ vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
+	adr	r11, Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	vld1.64	{q5}, [r11]		@ vmovdqa	Lk_mc_forward(%rip),%xmm5
+	bne	Lschedule_mangle_dec
+
+	@ encrypting
+	@ Write to q2 so we do not overlap table and destination below.
+	veor	q2, q0, q12		@ vpxor		Lk_s63(%rip),	%xmm0,	%xmm4
+	add	r2, r2, #16		@ add		$16,	%rdx
+	vtbl.8	d8, {q2}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm4
+	vtbl.8	d9, {q2}, d11
+	vtbl.8	d2, {q4}, d10	@ vpshufb	%xmm5,	%xmm4,	%xmm1
+	vtbl.8	d3, {q4}, d11
+	vtbl.8	d6, {q1}, d10	@ vpshufb	%xmm5,	%xmm1,	%xmm3
+	vtbl.8	d7, {q1}, d11
+	veor	q4, q4, q1		@ vpxor		%xmm1,	%xmm4,	%xmm4
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q3, q3, q4		@ vpxor		%xmm4,	%xmm3,	%xmm3
+
+	b	Lschedule_mangle_both
+.align	4
+Lschedule_mangle_dec:
+	@ inverse mix columns
+	adr	r11, Lk_dksd 		@ lea		Lk_dksd(%rip),%r11
+	vshr.u8	q1, q4, #4		@ vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
+	vand	q4, q4, q9		@ vpand		%xmm9,	%xmm4,	%xmm4	# 4 = lo
+
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x00(%r11),	%xmm2
+					@ vmovdqa	0x10(%r11),	%xmm3
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dksb ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x20(%r11),	%xmm2
+					@ vmovdqa	0x30(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dkse ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x40(%r11),	%xmm2
+					@ vmovdqa	0x50(%r11),	%xmm3
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	veor	q2, q2, q3		@ vpxor		%xmm3,	%xmm2,	%xmm2
+	vtbl.8	d6, {q15}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+	vld1.64	{q14,q15}, [r11]! 	@ vmovdqa	0x60(%r11),	%xmm2
+					@ vmovdqa	0x70(%r11),	%xmm4
+	@ Write to q13 so we do not overlap table and destination.
+	veor	q13, q3, q2		@ vpxor		%xmm2,	%xmm3,	%xmm3
+
+	vtbl.8	d4, {q14}, d8	@ vpshufb	%xmm4,	%xmm2,	%xmm2
+	vtbl.8	d5, {q14}, d9
+	vtbl.8	d6, {q13}, d10	@ vpshufb	%xmm5,	%xmm3,	%xmm3
+	vtbl.8	d7, {q13}, d11
+	vtbl.8	d8, {q15}, d2	@ vpshufb	%xmm1,	%xmm4,	%xmm4
+	vtbl.8	d9, {q15}, d3
+	vld1.64	{q1}, [r8]		@ vmovdqa	(%r8,%r10),	%xmm1
+	veor	q2, q2, q3		@ vpxor	%xmm3,	%xmm2,	%xmm2
+	veor	q3, q4, q2		@ vpxor	%xmm2,	%xmm4,	%xmm3
+
+	sub	r2, r2, #16		@ add	$-16,	%rdx
+
+Lschedule_mangle_both:
+	@ Write to q2 so table and destination do not overlap.
+	vtbl.8	d4, {q3}, d2	@ vpshufb	%xmm1,	%xmm3,	%xmm3
+	vtbl.8	d5, {q3}, d3
+	add	r8, r8, #64-16		@ add	$-16,	%r8
+	and	r8, r8, #~(1<<6)	@ and	$0x30,	%r8
+	vst1.64	{q2}, [r2]		@ vmovdqu	%xmm3,	(%rdx)
+	bx	lr
+
+
+.globl	_vpaes_set_encrypt_key
+.private_extern	_vpaes_set_encrypt_key
+#ifdef __thumb2__
+.thumb_func	_vpaes_set_encrypt_key
+#endif
+.align	4
+_vpaes_set_encrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ $5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	r3, #0		@ mov	$0,%ecx
+	mov	r8, #0x30		@ mov	$0x30,%r8d
+	bl	_vpaes_schedule_core
+	eor	r0, r0, r0
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+
+
+.globl	_vpaes_set_decrypt_key
+.private_extern	_vpaes_set_decrypt_key
+#ifdef __thumb2__
+.thumb_func	_vpaes_set_decrypt_key
+#endif
+.align	4
+_vpaes_set_decrypt_key:
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	lsr	r9, r1, #5		@ shr	$5,%eax
+	add	r9, r9, #5		@ $5,%eax
+	str	r9, [r2,#240]		@ mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	lsl	r9, r9, #4		@ shl	$4,%eax
+	add	r2, r2, #16		@ lea	16(%rdx,%rax),%rdx
+	add	r2, r2, r9
+
+	mov	r3, #1		@ mov	$1,%ecx
+	lsr	r8, r1, #1		@ shr	$1,%r8d
+	and	r8, r8, #32		@ and	$32,%r8d
+	eor	r8, r8, #32		@ xor	$32,%r8d	# nbits==192?0:32
+	bl	_vpaes_schedule_core
+
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+
+
+@ Additional constants for converting to bsaes.
+
+.align	4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@   def u64s_to_u128(x, y):
+@       return x | (y << 64)
+@   def u128_to_u64s(w):
+@       return w & ((1<<64)-1), w >> 64
+@   def get_byte(w, i):
+@       return (w >> (i*8)) & 0xff
+@   def apply_table(table, b):
+@       lo = b & 0xf
+@       hi = b >> 4
+@       return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@   def opt(b):
+@       table = [
+@           u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@           u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@       ]
+@       return apply_table(table, b)
+@   def rot_byte(b, n):
+@       return 0xff & ((b << n) | (b >> (8-n)))
+@   def skew(x):
+@       return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@               rot_byte(x, 4))
+@   table = [0, 0]
+@   for i in range(16):
+@       table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@       table[1] |= skew(opt(i<<4)) << (i*8)
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@   print("	.quad	0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+Lk_opt_then_skew:
+.quad	0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad	0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
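+@ A quick check of the example (editorial Python sketch):
+@   rotl8 = lambda w: ((w << 8) | (w >> 24)) & 0xffffffff
+@   bswap = lambda w: int.from_bytes(w.to_bytes(4, 'big'), 'little')
+@   assert rotl8(0x11223344) == 0x22334411
+@   assert bswap(0x22334411) == 0x11443322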
+Lk_decrypt_transform:
+.quad	0x0704050603000102, 0x0f0c0d0e0b08090a
+
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl	_vpaes_encrypt_key_to_bsaes
+.private_extern	_vpaes_encrypt_key_to_bsaes
+#ifdef __thumb2__
+.thumb_func	_vpaes_encrypt_key_to_bsaes
+#endif
+.align	4
+_vpaes_encrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. In particular,
+	@ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+	@ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+	@ contain the transformations not in the bsaes representation. This
+	@ function inverts those transforms.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+	adr	r2, Lk_mc_forward	@ Must be aligned to 8 mod 16.
+	add	r3, r2, 0x90		@ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+	vld1.64	{q12}, [r2]
+	vmov.i8	q10, #0x5b		@ Lk_s63 from vpaes-x86_64
+	adr	r11, Lk_opt		@ Must be aligned to 8 mod 16.
+	vmov.i8	q11, #0x63		@ LK_s63 without Lk_ipt applied
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+	@ Invert this with .Lk_opt.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+	@ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+	@ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
+Loop_enc_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+	@ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+	@ We use r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	Loop_enc_key_to_bsaes_last
+
+	@ Multiply by the circulant. This is its own inverse.
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+	vmov	q0, q1
+	vtbl.8	d4, {q1}, d24
+	vtbl.8	d5, {q1}, d25
+	veor	q0, q0, q2
+	vtbl.8	d2, {q2}, d24
+	vtbl.8	d3, {q2}, d25
+	veor	q0, q0, q1
+
+	@ XOR and finish.
+	veor	q0, q0, q10
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+	b	Loop_enc_key_to_bsaes
+
+Loop_enc_key_to_bsaes_last:
+	@ The final key does not have a basis transform (note
+	@ .Lschedule_mangle_last inverts the original transform). It only XORs
+	@ 0x63 and applies ShiftRows. The latter was already inverted in the
+	@ loop. Note that, because we act on the original representation, we use
+	@ q11, not q10.
+	veor	q0, q0, q11
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
+.globl	_vpaes_decrypt_key_to_bsaes
+.private_extern	_vpaes_decrypt_key_to_bsaes
+#ifdef __thumb2__
+.thumb_func	_vpaes_decrypt_key_to_bsaes
+#endif
+.align	4
+_vpaes_decrypt_key_to_bsaes:
+	stmdb	sp!, {r11, lr}
+
+	@ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+	@ computes the decryption key schedule in reverse. Additionally,
+	@ aes-x86_64.pl shares some transformations, so we must only partially
+	@ invert vpaes's transformations. In general, vpaes computes in a
+	@ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+	@ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+	@ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+	@
+	@ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+	@ representation, which does not match the other aes_nohw_*
+	@ implementations. The ARM aes_nohw_* stores each 32-bit word
+	@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+	@ cost of extra REV and VREV32 operations in little-endian ARM.
+
+	adr	r2, Lk_decrypt_transform
+	adr	r3, Lk_sr+0x30
+	adr	r11, Lk_opt_then_skew	@ Input to _vpaes_schedule_transform.
+	vld1.64	{q12}, [r2]	@ Reuse q12 from encryption.
+	vmov.i8	q9, #0x0f		@ Required by _vpaes_schedule_transform
+
+	@ vpaes stores one fewer round count than bsaes, but the number of keys
+	@ is the same.
+	ldr	r2, [r1,#240]
+	add	r2, r2, #1
+	str	r2, [r0,#240]
+
+	@ Undo the basis change and reapply the S-box affine transform. See
+	@ .Lschedule_mangle_last.
+	vld1.64	{q0}, [r1]!
+	bl	_vpaes_schedule_transform
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+	@ it simultaneously inverts MixColumns and the S-box affine transform.
+	@ See .Lk_dksd through .Lk_dks9.
+Loop_dec_key_to_bsaes:
+	vld1.64	{q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). The
+	@ decryption schedule is stored in reverse, so walking it forwards
+	@ cancels the inversion of the direction in which we cycle r3. We use
+	@ r3 rather than r8 to avoid a callee-saved register.
+	vld1.64	{q1}, [r3]
+	vtbl.8	d4, {q0}, d2
+	vtbl.8	d5, {q0}, d3
+	add	r3, r3, #64-16
+	and	r3, r3, #~(1<<6)
+	vmov	q0, q2
+
+	@ Handle the last key differently.
+	subs	r2, r2, #1
+	beq	Loop_dec_key_to_bsaes_last
+
+	@ Undo the basis change and reapply the S-box affine transform.
+	bl	_vpaes_schedule_transform
+
+	@ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+	@ combine the two operations in .Lk_decrypt_transform.
+	@
+	@ TODO(davidben): Where does the rotation come from?
+	vtbl.8	d2, {q0}, d24
+	vtbl.8	d3, {q0}, d25
+
+	vst1.64	{q1}, [r0]!
+	b	Loop_dec_key_to_bsaes
+
+Loop_dec_key_to_bsaes_last:
+	@ The final key only inverts ShiftRows (already done in the loop). See
+	@ .Lschedule_am_decrypting. Its basis is not transformed.
+	vrev32.8	q0, q0
+	vst1.64	{q0}, [r0]!
+
+	@ Wipe registers which contained key material.
+	veor	q0, q0, q0
+	veor	q1, q1, q1
+	veor	q2, q2, q2
+
+	ldmia	sp!, {r11, pc}	@ return
+
+.globl	_vpaes_ctr32_encrypt_blocks
+.private_extern	_vpaes_ctr32_encrypt_blocks
+#ifdef __thumb2__
+.thumb_func	_vpaes_ctr32_encrypt_blocks
+#endif
+.align	4
+_vpaes_ctr32_encrypt_blocks:
+	mov	ip, sp
+	stmdb	sp!, {r7,r8,r9,r10,r11, lr}
+	@ This function uses q4-q7 (d8-d15), which are callee-saved.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	cmp	r2, #0
+	@ r8 is passed on the stack.
+	ldr	r8, [ip]
+	beq	Lctr32_done
+
+	@ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+	mov	r9, r3
+	mov	r3, r2
+	mov	r2, r9
+
+	@ Load the IV and counter portion.
+	ldr	r7, [r8, #12]
+	vld1.8	{q7}, [r8]
+
+	bl	_vpaes_preheat
+	rev	r7, r7		@ The counter is big-endian.
+
+Lctr32_loop:
+	vmov	q0, q7
+	vld1.8	{q6}, [r0]!		@ Load input ahead of time
+	bl	_vpaes_encrypt_core
+	veor	q0, q0, q6		@ XOR input and result
+	vst1.8	{q0}, [r1]!
+	subs	r3, r3, #1
+	@ Update the counter.
+	add	r7, r7, #1
+	rev	r9, r7
+	vmov.32	d15[1], r9
+	bne	Lctr32_loop
+
+Lctr32_done:
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	ldmia	sp!, {r7,r8,r9,r10,r11, pc}	@ return
+
+#endif  // !OPENSSL_NO_ASM
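Stepping back from the assembly, _vpaes_ctr32_encrypt_blocks above follows the usual CTR32 pattern: encrypt the counter block, XOR the keystream into the input, and bump a 32-bit big-endian counter in the last four bytes of the IV without carrying further (the rev/vmov.32 d15[1] pair). A minimal C sketch of that pattern, with block_encrypt as a hypothetical one-block cipher primitive:

    #include <stdint.h>
    #include <string.h>

    // Hypothetical one-block cipher primitive standing in for
    // _vpaes_encrypt_core.
    void block_encrypt(uint8_t out[16], const uint8_t in[16], const void *key);

    static void ctr32_encrypt_blocks(uint8_t *out, const uint8_t *in,
                                     size_t blocks, const void *key,
                                     const uint8_t ivec[16]) {
      uint8_t counter[16], keystream[16];
      memcpy(counter, ivec, 16);
      // The counter lives in the last four bytes, big-endian; it wraps
      // within 32 bits and never carries into the rest of the IV.
      uint32_t ctr = ((uint32_t)counter[12] << 24) | ((uint32_t)counter[13] << 16) |
                     ((uint32_t)counter[14] << 8) | (uint32_t)counter[15];
      for (size_t i = 0; i < blocks; i++) {
        block_encrypt(keystream, counter, key);
        for (int j = 0; j < 16; j++) {
          out[j] = in[j] ^ keystream[j];
        }
        in += 16;
        out += 16;
        ctr++;
        counter[12] = (uint8_t)(ctr >> 24);
        counter[13] = (uint8_t)(ctr >> 16);
        counter[14] = (uint8_t)(ctr >> 8);
        counter[15] = (uint8_t)ctr;
      }
    }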
diff --git a/apple-arm/crypto/test/trampoline-armv4.S b/apple-arm/crypto/test/trampoline-armv4.S
new file mode 100644
index 0000000..9d74f55
--- /dev/null
+++ b/apple-arm/crypto/test/trampoline-armv4.S
@@ -0,0 +1,376 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax	unified
+
+
+
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@                              const uint32_t *argv, size_t argc,
+@                              int unwind);
+
+.globl	_abi_test_trampoline
+.private_extern	_abi_test_trampoline
+.align	4
+_abi_test_trampoline:
+	@ Save parameters and all callee-saved registers. For convenience, we
+	@ save r9 on iOS even though it's volatile.
+	vstmdb	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+	stmdb	sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+	@ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+	@ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+	sub	sp, sp, #28
+
+	@ Every register in AAPCS is either non-volatile or a parameter (except
+	@ r9 on iOS), so by the time of the actual call this code has no
+	@ scratch registers left. First fill in stack parameters while there
+	@ are registers to spare.
+	cmp	r3, #4
+	bls	Lstack_args_done
+	mov	r4, sp				@ r4 is the output pointer.
+	add	r5, r2, r3, lsl #2	@ Set r5 to the end of argv.
+	add	r2, r2, #16		@ Skip four arguments.
+Lstack_args_loop:
+	ldr	r6, [r2], #4
+	cmp	r2, r5
+	str	r6, [r4], #4
+	bne	Lstack_args_loop
+
+Lstack_args_done:
+	@ Load registers from |r1|.
+	vldmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	ldmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	ldmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Load register parameters. This uses up our remaining registers, so we
+	@ repurpose lr as scratch space.
+	ldr	r3, [sp, #40]	@ Reload argc.
+	ldr	lr, [sp, #36]		@ Load argv into lr.
+	cmp	r3, #3
+	bhi	Larg_r3
+	beq	Larg_r2
+	cmp	r3, #1
+	bhi	Larg_r1
+	beq	Larg_r0
+	b	Largs_done
+
+Larg_r3:
+	ldr	r3, [lr, #12]	@ argv[3]
+Larg_r2:
+	ldr	r2, [lr, #8]	@ argv[2]
+Larg_r1:
+	ldr	r1, [lr, #4]	@ argv[1]
+Larg_r0:
+	ldr	r0, [lr]	@ argv[0]
+Largs_done:
+
+	@ With every other register in use, load the function pointer into lr
+	@ and call the function.
+	ldr	lr, [sp, #28]
+	blx	lr
+
+	@ r1-r3 are free for use again. The trampoline only supports
+	@ single-return functions. Pass r4-r11 to the caller.
+	ldr	r1, [sp, #32]
+	vstmia	r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+	@ r9 is not volatile on iOS.
+	stmia	r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+	stmia	r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+	@ Unwind the stack and restore registers.
+	add	sp, sp, #44		@ 44 = 28+16
+	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ Skip r0-r3 (see +16 above).
+	vldmia	sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+	bx	lr
+
+
+.globl	_abi_test_clobber_r0
+.private_extern	_abi_test_clobber_r0
+.align	4
+_abi_test_clobber_r0:
+	mov	r0, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r1
+.private_extern	_abi_test_clobber_r1
+.align	4
+_abi_test_clobber_r1:
+	mov	r1, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r2
+.private_extern	_abi_test_clobber_r2
+.align	4
+_abi_test_clobber_r2:
+	mov	r2, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r3
+.private_extern	_abi_test_clobber_r3
+.align	4
+_abi_test_clobber_r3:
+	mov	r3, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r4
+.private_extern	_abi_test_clobber_r4
+.align	4
+_abi_test_clobber_r4:
+	mov	r4, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r5
+.private_extern	_abi_test_clobber_r5
+.align	4
+_abi_test_clobber_r5:
+	mov	r5, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r6
+.private_extern	_abi_test_clobber_r6
+.align	4
+_abi_test_clobber_r6:
+	mov	r6, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r7
+.private_extern	_abi_test_clobber_r7
+.align	4
+_abi_test_clobber_r7:
+	mov	r7, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r8
+.private_extern	_abi_test_clobber_r8
+.align	4
+_abi_test_clobber_r8:
+	mov	r8, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r9
+.private_extern	_abi_test_clobber_r9
+.align	4
+_abi_test_clobber_r9:
+	mov	r9, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r10
+.private_extern	_abi_test_clobber_r10
+.align	4
+_abi_test_clobber_r10:
+	mov	r10, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r11
+.private_extern	_abi_test_clobber_r11
+.align	4
+_abi_test_clobber_r11:
+	mov	r11, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_r12
+.private_extern	_abi_test_clobber_r12
+.align	4
+_abi_test_clobber_r12:
+	mov	r12, #0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d0
+.private_extern	_abi_test_clobber_d0
+.align	4
+_abi_test_clobber_d0:
+	mov	r0, #0
+	vmov	s0, r0
+	vmov	s1, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d1
+.private_extern	_abi_test_clobber_d1
+.align	4
+_abi_test_clobber_d1:
+	mov	r0, #0
+	vmov	s2, r0
+	vmov	s3, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d2
+.private_extern	_abi_test_clobber_d2
+.align	4
+_abi_test_clobber_d2:
+	mov	r0, #0
+	vmov	s4, r0
+	vmov	s5, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d3
+.private_extern	_abi_test_clobber_d3
+.align	4
+_abi_test_clobber_d3:
+	mov	r0, #0
+	vmov	s6, r0
+	vmov	s7, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d4
+.private_extern	_abi_test_clobber_d4
+.align	4
+_abi_test_clobber_d4:
+	mov	r0, #0
+	vmov	s8, r0
+	vmov	s9, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d5
+.private_extern	_abi_test_clobber_d5
+.align	4
+_abi_test_clobber_d5:
+	mov	r0, #0
+	vmov	s10, r0
+	vmov	s11, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d6
+.private_extern	_abi_test_clobber_d6
+.align	4
+_abi_test_clobber_d6:
+	mov	r0, #0
+	vmov	s12, r0
+	vmov	s13, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d7
+.private_extern	_abi_test_clobber_d7
+.align	4
+_abi_test_clobber_d7:
+	mov	r0, #0
+	vmov	s14, r0
+	vmov	s15, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d8
+.private_extern	_abi_test_clobber_d8
+.align	4
+_abi_test_clobber_d8:
+	mov	r0, #0
+	vmov	s16, r0
+	vmov	s17, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d9
+.private_extern	_abi_test_clobber_d9
+.align	4
+_abi_test_clobber_d9:
+	mov	r0, #0
+	vmov	s18, r0
+	vmov	s19, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d10
+.private_extern	_abi_test_clobber_d10
+.align	4
+_abi_test_clobber_d10:
+	mov	r0, #0
+	vmov	s20, r0
+	vmov	s21, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d11
+.private_extern	_abi_test_clobber_d11
+.align	4
+_abi_test_clobber_d11:
+	mov	r0, #0
+	vmov	s22, r0
+	vmov	s23, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d12
+.private_extern	_abi_test_clobber_d12
+.align	4
+_abi_test_clobber_d12:
+	mov	r0, #0
+	vmov	s24, r0
+	vmov	s25, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d13
+.private_extern	_abi_test_clobber_d13
+.align	4
+_abi_test_clobber_d13:
+	mov	r0, #0
+	vmov	s26, r0
+	vmov	s27, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d14
+.private_extern	_abi_test_clobber_d14
+.align	4
+_abi_test_clobber_d14:
+	mov	r0, #0
+	vmov	s28, r0
+	vmov	s29, r0
+	bx	lr
+
+
+.globl	_abi_test_clobber_d15
+.private_extern	_abi_test_clobber_d15
+.align	4
+_abi_test_clobber_d15:
+	mov	r0, #0
+	vmov	s30, r0
+	vmov	s31, r0
+	bx	lr
+
+#endif  // !OPENSSL_NO_ASM
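A hedged sketch of how a test harness might drive abi_test_trampoline, following the prototype documented in the comment above. The CallerState layout shown here (d8-d15 then r4-r11, mirroring the vldmia/ldmia order in the assembly) is an illustrative assumption, not the library's actual definition, and the variadic (...) in the documented prototype is approximated with a cast:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
      uint64_t d[8];  // d8-d15
      uint32_t r[8];  // r4-r11 (r9 is skipped on iOS)
    } CallerState;

    uint32_t abi_test_trampoline(void (*func)(void), CallerState *state,
                                 const uint32_t *argv, size_t argc, int unwind);

    static uint32_t call_add(uint32_t (*add)(uint32_t, uint32_t)) {
      CallerState state = {{0}};  // A real harness fills in sentinel values.
      const uint32_t argv[2] = {2, 3};
      // The trampoline loads argv[0..1] into r0-r1, calls |add|, then writes
      // the callee-saved registers it observed back into |state| so the
      // harness can check that |add| preserved them.
      return abi_test_trampoline((void (*)(void))add, &state, argv, 2, 0);
    }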
diff --git a/mac-x86/crypto/chacha/chacha-x86.S b/apple-x86/crypto/chacha/chacha-x86.S
similarity index 100%
rename from mac-x86/crypto/chacha/chacha-x86.S
rename to apple-x86/crypto/chacha/chacha-x86.S
diff --git a/mac-x86/crypto/fipsmodule/aesni-x86.S b/apple-x86/crypto/fipsmodule/aesni-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/aesni-x86.S
rename to apple-x86/crypto/fipsmodule/aesni-x86.S
diff --git a/mac-x86/crypto/fipsmodule/bn-586.S b/apple-x86/crypto/fipsmodule/bn-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/bn-586.S
rename to apple-x86/crypto/fipsmodule/bn-586.S
diff --git a/mac-x86/crypto/fipsmodule/co-586.S b/apple-x86/crypto/fipsmodule/co-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/co-586.S
rename to apple-x86/crypto/fipsmodule/co-586.S
diff --git a/mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/ghash-ssse3-x86.S
rename to apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S
diff --git a/mac-x86/crypto/fipsmodule/ghash-x86.S b/apple-x86/crypto/fipsmodule/ghash-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/ghash-x86.S
rename to apple-x86/crypto/fipsmodule/ghash-x86.S
diff --git a/mac-x86/crypto/fipsmodule/md5-586.S b/apple-x86/crypto/fipsmodule/md5-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/md5-586.S
rename to apple-x86/crypto/fipsmodule/md5-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha1-586.S b/apple-x86/crypto/fipsmodule/sha1-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha1-586.S
rename to apple-x86/crypto/fipsmodule/sha1-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha256-586.S b/apple-x86/crypto/fipsmodule/sha256-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha256-586.S
rename to apple-x86/crypto/fipsmodule/sha256-586.S
diff --git a/mac-x86/crypto/fipsmodule/sha512-586.S b/apple-x86/crypto/fipsmodule/sha512-586.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/sha512-586.S
rename to apple-x86/crypto/fipsmodule/sha512-586.S
diff --git a/mac-x86/crypto/fipsmodule/vpaes-x86.S b/apple-x86/crypto/fipsmodule/vpaes-x86.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/vpaes-x86.S
rename to apple-x86/crypto/fipsmodule/vpaes-x86.S
diff --git a/mac-x86/crypto/fipsmodule/x86-mont.S b/apple-x86/crypto/fipsmodule/x86-mont.S
similarity index 100%
rename from mac-x86/crypto/fipsmodule/x86-mont.S
rename to apple-x86/crypto/fipsmodule/x86-mont.S
diff --git a/mac-x86/crypto/test/trampoline-x86.S b/apple-x86/crypto/test/trampoline-x86.S
similarity index 100%
rename from mac-x86/crypto/test/trampoline-x86.S
rename to apple-x86/crypto/test/trampoline-x86.S
diff --git a/mac-x86_64/crypto/chacha/chacha-x86_64.S b/apple-x86_64/crypto/chacha/chacha-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/chacha/chacha-x86_64.S
rename to apple-x86_64/crypto/chacha/chacha-x86_64.S
diff --git a/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
rename to apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
diff --git a/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
rename to apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S b/apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/aesni-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/aesni-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S b/apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/ghash-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/ghash-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/md5-x86_64.S b/apple-x86_64/crypto/fipsmodule/md5-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/md5-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/md5-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
rename to apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
diff --git a/mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
rename to apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
diff --git a/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/rsaz-avx2.S
rename to apple-x86_64/crypto/fipsmodule/rsaz-avx2.S
diff --git a/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/sha1-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha1-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
similarity index 93%
rename from mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
index d94268d..00dc01c 100644
--- a/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S
+++ b/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -24,6 +24,8 @@
 	movl	0(%r11),%r9d
 	movl	4(%r11),%r10d
 	movl	8(%r11),%r11d
+	testl	$536870912,%r11d
+	jnz	L$shaext_shortcut
 	andl	$1073741824,%r9d
 	andl	$268435968,%r10d
 	orl	%r9d,%r10d
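The new testl checks bit 29 (0x20000000 = 536870912) of the third OPENSSL_ia32cap_P word, i.e. CPUID.(EAX=7,ECX=0):EBX bit 29, the SHA extensions flag, and jumps to the new SHA-NI body when set. A C-level rendering of that dispatch test, with has_sha_extensions as an illustrative name:

    #include <stdint.h>

    extern uint32_t OPENSSL_ia32cap_P[4];

    static int has_sha_extensions(void) {
      // Word 2 holds CPUID.(EAX=7,ECX=0):EBX; bit 29 is SHA-NI.
      return (OPENSSL_ia32cap_P[2] & (1u << 29)) != 0;
    }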
@@ -1782,6 +1784,215 @@
 .byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 
 .p2align	6
+sha256_block_data_order_shaext:
+
+L$shaext_shortcut:
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	512-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	L$oop_shaext
+
+.p2align	4
+L$oop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220
+.byte	15,56,203,202
+
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	256-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	288-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	320-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	352-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	384-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	416-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	448-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	480-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	L$oop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	.byte	0xf3,0xc3
+
+
+
+.p2align	6
 sha256_block_data_order_ssse3:
 
 L$ssse3_shortcut:
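The long .byte runs in the SHA-NI body above are hand-assembled encodings for instructions that older assemblers cannot emit by mnemonic. Decoded by hand for reference (informational, not authoritative):

    15,56,203,...     (0F 38 CB)     sha256rnds2
    15,56,204,...     (0F 38 CC)     sha256msg1
    15,56,205,...     (0F 38 CD)     sha256msg2
    102,15,56,0,...   (66 0F 38 00)  pshufb
    102,15,58,15,...  (66 0F 3A 0F)  palignr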
diff --git a/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S b/apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/sha512-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/sha512-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S
rename to apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S
diff --git a/mac-x86_64/crypto/fipsmodule/x86_64-mont.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/x86_64-mont.S
rename to apple-x86_64/crypto/fipsmodule/x86_64-mont.S
diff --git a/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S b/apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
similarity index 100%
rename from mac-x86_64/crypto/fipsmodule/x86_64-mont5.S
rename to apple-x86_64/crypto/fipsmodule/x86_64-mont5.S
diff --git a/mac-x86_64/crypto/test/trampoline-x86_64.S b/apple-x86_64/crypto/test/trampoline-x86_64.S
similarity index 100%
rename from mac-x86_64/crypto/test/trampoline-x86_64.S
rename to apple-x86_64/crypto/test/trampoline-x86_64.S
diff --git a/err_data.c b/err_data.c
index 8432081..de52cc0 100644
--- a/err_data.c
+++ b/err_data.c
@@ -193,50 +193,50 @@
     0x283480b9,
     0x283500f7,
     0x28358c94,
-    0x2c323286,
+    0x2c323284,
     0x2c32932e,
-    0x2c333294,
-    0x2c33b2a6,
-    0x2c3432ba,
-    0x2c34b2cc,
-    0x2c3532e7,
-    0x2c35b2f9,
-    0x2c363329,
+    0x2c333292,
+    0x2c33b2a4,
+    0x2c3432b8,
+    0x2c34b2ca,
+    0x2c3532e5,
+    0x2c35b2f7,
+    0x2c363327,
     0x2c36833a,
-    0x2c373336,
-    0x2c37b362,
-    0x2c383387,
-    0x2c38b39e,
-    0x2c3933bc,
-    0x2c39b3cc,
-    0x2c3a33de,
-    0x2c3ab3f2,
-    0x2c3b3403,
-    0x2c3bb422,
+    0x2c373334,
+    0x2c37b360,
+    0x2c383385,
+    0x2c38b39c,
+    0x2c3933ba,
+    0x2c39b3ca,
+    0x2c3a33dc,
+    0x2c3ab3f0,
+    0x2c3b3401,
+    0x2c3bb420,
     0x2c3c1340,
     0x2c3c9356,
-    0x2c3d3467,
+    0x2c3d3465,
     0x2c3d936f,
-    0x2c3e3491,
-    0x2c3eb49f,
-    0x2c3f34b7,
-    0x2c3fb4cf,
-    0x2c4034f9,
+    0x2c3e348f,
+    0x2c3eb49d,
+    0x2c3f34b5,
+    0x2c3fb4cd,
+    0x2c4034f7,
     0x2c409241,
-    0x2c41350a,
-    0x2c41b51d,
+    0x2c413508,
+    0x2c41b51b,
     0x2c421207,
-    0x2c42b52e,
+    0x2c42b52c,
     0x2c43074a,
-    0x2c43b414,
-    0x2c443375,
-    0x2c44b4dc,
-    0x2c45330c,
-    0x2c45b348,
-    0x2c4633ac,
-    0x2c46b436,
-    0x2c47344b,
-    0x2c47b484,
+    0x2c43b412,
+    0x2c443373,
+    0x2c44b4da,
+    0x2c45330a,
+    0x2c45b346,
+    0x2c4633aa,
+    0x2c46b434,
+    0x2c473449,
+    0x2c47b482,
     0x30320000,
     0x30328015,
     0x3033001f,
@@ -433,158 +433,158 @@
     0x404ea057,
     0x404f20f1,
     0x404fa167,
-    0x405021be,
-    0x4050a1d2,
-    0x40512205,
-    0x40522215,
-    0x4052a239,
-    0x40532251,
-    0x4053a264,
-    0x40542279,
-    0x4054a29c,
-    0x405522c7,
-    0x4055a304,
-    0x40562329,
-    0x4056a342,
-    0x4057235a,
-    0x4057a36d,
-    0x40582382,
-    0x4058a3a9,
-    0x405923d8,
-    0x4059a405,
-    0x405a2419,
-    0x405aa429,
-    0x405b2441,
-    0x405ba452,
-    0x405c2465,
-    0x405ca4a4,
-    0x405d24b1,
-    0x405da4d6,
-    0x405e2514,
+    0x405021d6,
+    0x4050a1ea,
+    0x4051221d,
+    0x4052222d,
+    0x4052a251,
+    0x40532269,
+    0x4053a27c,
+    0x40542291,
+    0x4054a2b4,
+    0x405522df,
+    0x4055a31c,
+    0x40562341,
+    0x4056a35a,
+    0x40572372,
+    0x4057a385,
+    0x4058239a,
+    0x4058a3c1,
+    0x405923f0,
+    0x4059a41d,
+    0x405a2431,
+    0x405aa441,
+    0x405b2459,
+    0x405ba46a,
+    0x405c247d,
+    0x405ca4bc,
+    0x405d24c9,
+    0x405da4ee,
+    0x405e252c,
     0x405e8adb,
-    0x405f254f,
-    0x405fa55c,
-    0x4060256a,
-    0x4060a58c,
-    0x406125ed,
-    0x4061a625,
-    0x4062263c,
-    0x4062a64d,
-    0x4063269a,
-    0x4063a6af,
-    0x406426c6,
-    0x4064a6f2,
-    0x4065270d,
-    0x4065a724,
-    0x4066273c,
-    0x4066a766,
-    0x40672791,
-    0x4067a7d6,
-    0x4068281e,
-    0x4068a83f,
-    0x40692871,
-    0x4069a89f,
-    0x406a28c0,
-    0x406aa8e0,
-    0x406b2a68,
-    0x406baa8b,
-    0x406c2aa1,
-    0x406cadab,
-    0x406d2dda,
-    0x406dae02,
-    0x406e2e30,
-    0x406eae7d,
-    0x406f2ed6,
-    0x406faf0e,
-    0x40702f21,
-    0x4070af3e,
+    0x405f254d,
+    0x405fa55a,
+    0x40602568,
+    0x4060a58a,
+    0x406125eb,
+    0x4061a623,
+    0x4062263a,
+    0x4062a64b,
+    0x40632698,
+    0x4063a6ad,
+    0x406426c4,
+    0x4064a6f0,
+    0x4065270b,
+    0x4065a722,
+    0x4066273a,
+    0x4066a764,
+    0x4067278f,
+    0x4067a7d4,
+    0x4068281c,
+    0x4068a83d,
+    0x4069286f,
+    0x4069a89d,
+    0x406a28be,
+    0x406aa8de,
+    0x406b2a66,
+    0x406baa89,
+    0x406c2a9f,
+    0x406cada9,
+    0x406d2dd8,
+    0x406dae00,
+    0x406e2e2e,
+    0x406eae7b,
+    0x406f2ed4,
+    0x406faf0c,
+    0x40702f1f,
+    0x4070af3c,
     0x4071082a,
-    0x4071af50,
-    0x40722f63,
-    0x4072af99,
-    0x40732fb1,
+    0x4071af4e,
+    0x40722f61,
+    0x4072af97,
+    0x40732faf,
     0x40739540,
-    0x40742fc5,
-    0x4074afdf,
-    0x40752ff0,
-    0x4075b004,
-    0x40763012,
+    0x40742fc3,
+    0x4074afdd,
+    0x40752fee,
+    0x4075b002,
+    0x40763010,
     0x40769304,
-    0x40773037,
-    0x4077b077,
-    0x40783092,
-    0x4078b0cb,
-    0x407930e2,
-    0x4079b0f8,
-    0x407a3124,
-    0x407ab137,
-    0x407b314c,
-    0x407bb15e,
-    0x407c318f,
-    0x407cb198,
-    0x407d285a,
-    0x407da177,
-    0x407e30a7,
-    0x407ea3b9,
+    0x40773035,
+    0x4077b075,
+    0x40783090,
+    0x4078b0c9,
+    0x407930e0,
+    0x4079b0f6,
+    0x407a3122,
+    0x407ab135,
+    0x407b314a,
+    0x407bb15c,
+    0x407c318d,
+    0x407cb196,
+    0x407d2858,
+    0x407da18f,
+    0x407e30a5,
+    0x407ea3d1,
     0x407f1dcb,
     0x407f9f9e,
     0x40802101,
     0x40809df3,
-    0x40812227,
+    0x4081223f,
     0x4081a0a5,
-    0x40822e1b,
+    0x40822e19,
     0x40829b46,
-    0x40832394,
-    0x4083a6d7,
+    0x408323ac,
+    0x4083a6d5,
     0x40841e07,
-    0x4084a3f1,
-    0x40852476,
-    0x4085a5b4,
-    0x408624f6,
-    0x4086a191,
-    0x40872e61,
-    0x4087a602,
+    0x4084a409,
+    0x4085248e,
+    0x4085a5b2,
+    0x4086250e,
+    0x4086a1a9,
+    0x40872e5f,
+    0x4087a600,
     0x40881b84,
-    0x4088a7e9,
+    0x4088a7e7,
     0x40891bd3,
     0x40899b60,
-    0x408a2ad9,
+    0x408a2ad7,
     0x408a9958,
-    0x408b3173,
-    0x408baeeb,
-    0x408c2486,
+    0x408b3171,
+    0x408baee9,
+    0x408c249e,
     0x408c9990,
     0x408d1eef,
     0x408d9e39,
     0x408e201f,
-    0x408ea2e4,
-    0x408f27fd,
-    0x408fa5d0,
-    0x409027b2,
-    0x4090a4c8,
-    0x40912ac1,
+    0x408ea2fc,
+    0x408f27fb,
+    0x408fa5ce,
+    0x409027b0,
+    0x4090a4e0,
+    0x40912abf,
     0x409199b6,
     0x40921c20,
-    0x4092ae9c,
-    0x40932f7c,
-    0x4093a1a2,
+    0x4092ae9a,
+    0x40932f7a,
+    0x4093a1ba,
     0x40941e1b,
-    0x4094aaf2,
-    0x4095265e,
-    0x4095b104,
-    0x40962e48,
+    0x4094aaf0,
+    0x4095265c,
+    0x4095b102,
+    0x40962e46,
     0x4096a11a,
-    0x409721ed,
+    0x40972205,
     0x4097a06e,
     0x40981c80,
-    0x4098a672,
-    0x40992eb8,
-    0x4099a311,
-    0x409a22aa,
+    0x4098a670,
+    0x40992eb6,
+    0x4099a329,
+    0x409a22c2,
     0x409a9974,
     0x409b1e75,
     0x409b9ea0,
-    0x409c3059,
+    0x409c3057,
     0x409c9ec8,
     0x409d20d6,
     0x409da0bb,
@@ -592,42 +592,42 @@
     0x409ea14f,
     0x409f2137,
     0x409f9e68,
-    0x40a02535,
+    0x40a02177,
     0x40a0a088,
-    0x41f42993,
-    0x41f92a25,
-    0x41fe2918,
-    0x41feabce,
-    0x41ff2cfc,
-    0x420329ac,
-    0x420829ce,
-    0x4208aa0a,
-    0x420928fc,
-    0x4209aa44,
-    0x420a2953,
-    0x420aa933,
-    0x420b2973,
-    0x420ba9ec,
-    0x420c2d18,
-    0x420cab02,
-    0x420d2bb5,
-    0x420dabec,
-    0x42122c1f,
-    0x42172cdf,
-    0x4217ac61,
-    0x421c2c83,
-    0x421f2c3e,
-    0x42212d90,
-    0x42262cc2,
-    0x422b2d6e,
-    0x422bab90,
-    0x422c2d50,
-    0x422cab43,
-    0x422d2b1c,
-    0x422dad2f,
-    0x422e2b6f,
-    0x42302c9e,
-    0x4230ac06,
+    0x41f42991,
+    0x41f92a23,
+    0x41fe2916,
+    0x41feabcc,
+    0x41ff2cfa,
+    0x420329aa,
+    0x420829cc,
+    0x4208aa08,
+    0x420928fa,
+    0x4209aa42,
+    0x420a2951,
+    0x420aa931,
+    0x420b2971,
+    0x420ba9ea,
+    0x420c2d16,
+    0x420cab00,
+    0x420d2bb3,
+    0x420dabea,
+    0x42122c1d,
+    0x42172cdd,
+    0x4217ac5f,
+    0x421c2c81,
+    0x421f2c3c,
+    0x42212d8e,
+    0x42262cc0,
+    0x422b2d6c,
+    0x422bab8e,
+    0x422c2d4e,
+    0x422cab41,
+    0x422d2b1a,
+    0x422dad2d,
+    0x422e2b6d,
+    0x42302c9c,
+    0x4230ac04,
     0x44320755,
     0x44328764,
     0x44330770,
@@ -682,71 +682,71 @@
     0x4c41159d,
     0x4c419420,
     0x4c421589,
-    0x50323540,
-    0x5032b54f,
-    0x5033355a,
-    0x5033b56a,
-    0x50343583,
-    0x5034b59d,
-    0x503535ab,
-    0x5035b5c1,
-    0x503635d3,
-    0x5036b5e9,
-    0x50373602,
-    0x5037b615,
-    0x5038362d,
-    0x5038b63e,
-    0x50393653,
-    0x5039b667,
-    0x503a3687,
-    0x503ab69d,
-    0x503b36b5,
-    0x503bb6c7,
-    0x503c36e3,
-    0x503cb6fa,
-    0x503d3713,
-    0x503db729,
-    0x503e3736,
-    0x503eb74c,
-    0x503f375e,
+    0x5032353e,
+    0x5032b54d,
+    0x50333558,
+    0x5033b568,
+    0x50343581,
+    0x5034b59b,
+    0x503535a9,
+    0x5035b5bf,
+    0x503635d1,
+    0x5036b5e7,
+    0x50373600,
+    0x5037b613,
+    0x5038362b,
+    0x5038b63c,
+    0x50393651,
+    0x5039b665,
+    0x503a3685,
+    0x503ab69b,
+    0x503b36b3,
+    0x503bb6c5,
+    0x503c36e1,
+    0x503cb6f8,
+    0x503d3711,
+    0x503db727,
+    0x503e3734,
+    0x503eb74a,
+    0x503f375c,
     0x503f83a3,
-    0x50403771,
-    0x5040b781,
-    0x5041379b,
-    0x5041b7aa,
-    0x504237c4,
-    0x5042b7e1,
-    0x504337f1,
-    0x5043b801,
-    0x5044381e,
+    0x5040376f,
+    0x5040b77f,
+    0x50413799,
+    0x5041b7a8,
+    0x504237c2,
+    0x5042b7df,
+    0x504337ef,
+    0x5043b7ff,
+    0x5044381c,
     0x50448459,
-    0x50453832,
-    0x5045b850,
-    0x50463863,
-    0x5046b879,
-    0x5047388b,
-    0x5047b8a0,
-    0x504838c6,
-    0x5048b8d4,
-    0x504938e7,
-    0x5049b8fc,
-    0x504a3912,
-    0x504ab922,
-    0x504b3942,
-    0x504bb955,
-    0x504c3978,
-    0x504cb9a6,
-    0x504d39d3,
-    0x504db9f0,
-    0x504e3a0b,
-    0x504eba27,
-    0x504f3a39,
-    0x504fba50,
-    0x50503a5f,
+    0x50453830,
+    0x5045b84e,
+    0x50463861,
+    0x5046b877,
+    0x50473889,
+    0x5047b89e,
+    0x504838c4,
+    0x5048b8d2,
+    0x504938e5,
+    0x5049b8fa,
+    0x504a3910,
+    0x504ab920,
+    0x504b3940,
+    0x504bb953,
+    0x504c3976,
+    0x504cb9a4,
+    0x504d39d1,
+    0x504db9ee,
+    0x504e3a09,
+    0x504eba25,
+    0x504f3a37,
+    0x504fba4e,
+    0x50503a5d,
     0x50508719,
-    0x50513a72,
-    0x5051b810,
-    0x505239b8,
+    0x50513a70,
+    0x5051b80e,
+    0x505239b6,
     0x58320f8d,
     0x68320f4f,
     0x68328ca7,
@@ -790,19 +790,19 @@
     0x7c32121d,
     0x80321433,
     0x80328090,
-    0x80333255,
+    0x80333253,
     0x803380b9,
-    0x80343264,
-    0x8034b1cc,
-    0x803531ea,
-    0x8035b278,
-    0x8036322c,
-    0x8036b1db,
-    0x8037321e,
-    0x8037b1b9,
-    0x8038323f,
-    0x8038b1fb,
-    0x80393210,
+    0x80343262,
+    0x8034b1ca,
+    0x803531e8,
+    0x8035b276,
+    0x8036322a,
+    0x8036b1d9,
+    0x8037321c,
+    0x8037b1b7,
+    0x8038323d,
+    0x8038b1f9,
+    0x8039320e,
 };
 
 const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]);
@@ -1226,6 +1226,7 @@
     "INVALID_ECH_CONFIG_LIST\0"
     "INVALID_ECH_PUBLIC_NAME\0"
     "INVALID_MESSAGE\0"
+    "INVALID_OUTER_EXTENSION\0"
     "INVALID_OUTER_RECORD_TYPE\0"
     "INVALID_SCT_LIST\0"
     "INVALID_SIGNATURE_ALGORITHM\0"
@@ -1269,7 +1270,6 @@
     "OLD_SESSION_CIPHER_NOT_RETURNED\0"
     "OLD_SESSION_PRF_HASH_MISMATCH\0"
     "OLD_SESSION_VERSION_NOT_RETURNED\0"
-    "OUTER_EXTENSION_NOT_FOUND\0"
     "PARSE_TLSEXT\0"
     "PATH_TOO_LONG\0"
     "PEER_DID_NOT_RETURN_A_CERTIFICATE\0"
diff --git a/eureka.mk b/eureka.mk
index ec431f4..93d4437 100644
--- a/eureka.mk
+++ b/eureka.mk
@@ -73,6 +73,7 @@
   src/crypto/cipher_extra/e_aesctrhmac.c\
   src/crypto/cipher_extra/e_aesgcmsiv.c\
   src/crypto/cipher_extra/e_chacha20poly1305.c\
+  src/crypto/cipher_extra/e_des.c\
   src/crypto/cipher_extra/e_null.c\
   src/crypto/cipher_extra/e_rc2.c\
   src/crypto/cipher_extra/e_rc4.c\
@@ -91,6 +92,7 @@
   src/crypto/crypto.c\
   src/crypto/curve25519/curve25519.c\
   src/crypto/curve25519/spake25519.c\
+  src/crypto/des/des.c\
   src/crypto/dh_extra/dh_asn1.c\
   src/crypto/dh_extra/params.c\
   src/crypto/digest_extra/digest_extra.c\
diff --git a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
index a09764a..6ce216f 100644
--- a/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
+++ b/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -25,6 +25,8 @@
 	movl	0(%r11),%r9d
 	movl	4(%r11),%r10d
 	movl	8(%r11),%r11d
+	testl	$536870912,%r11d
+	jnz	.Lshaext_shortcut
 	andl	$1073741824,%r9d
 	andl	$268435968,%r10d
 	orl	%r9d,%r10d
@@ -1781,6 +1783,215 @@
 .long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
 .long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
 .byte	83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type	sha256_block_data_order_shaext,@function
+.align	64
+sha256_block_data_order_shaext:
+.cfi_startproc	
+.Lshaext_shortcut:
+	leaq	K256+128(%rip),%rcx
+	movdqu	(%rdi),%xmm1
+	movdqu	16(%rdi),%xmm2
+	movdqa	512-128(%rcx),%xmm7
+
+	pshufd	$0x1b,%xmm1,%xmm0
+	pshufd	$0xb1,%xmm1,%xmm1
+	pshufd	$0x1b,%xmm2,%xmm2
+	movdqa	%xmm7,%xmm8
+.byte	102,15,58,15,202,8
+	punpcklqdq	%xmm0,%xmm2
+	jmp	.Loop_shaext
+
+.align	16
+.Loop_shaext:
+	movdqu	(%rsi),%xmm3
+	movdqu	16(%rsi),%xmm4
+	movdqu	32(%rsi),%xmm5
+.byte	102,15,56,0,223
+	movdqu	48(%rsi),%xmm6
+
+	movdqa	0-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	102,15,56,0,231
+	movdqa	%xmm2,%xmm10
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	nop
+	movdqa	%xmm1,%xmm9
+.byte	15,56,203,202
+
+	movdqa	32-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	102,15,56,0,239
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	leaq	64(%rsi),%rsi
+.byte	15,56,204,220
+.byte	15,56,203,202
+
+	movdqa	64-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	102,15,56,0,247
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+
+	movdqa	96-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	128-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	160-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	192-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	224-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	256-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	288-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+	nop
+	paddd	%xmm7,%xmm6
+.byte	15,56,204,220
+.byte	15,56,203,202
+	movdqa	320-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,205,245
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm6,%xmm7
+.byte	102,15,58,15,253,4
+	nop
+	paddd	%xmm7,%xmm3
+.byte	15,56,204,229
+.byte	15,56,203,202
+	movdqa	352-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+.byte	15,56,205,222
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm3,%xmm7
+.byte	102,15,58,15,254,4
+	nop
+	paddd	%xmm7,%xmm4
+.byte	15,56,204,238
+.byte	15,56,203,202
+	movdqa	384-128(%rcx),%xmm0
+	paddd	%xmm3,%xmm0
+.byte	15,56,205,227
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm4,%xmm7
+.byte	102,15,58,15,251,4
+	nop
+	paddd	%xmm7,%xmm5
+.byte	15,56,204,243
+.byte	15,56,203,202
+	movdqa	416-128(%rcx),%xmm0
+	paddd	%xmm4,%xmm0
+.byte	15,56,205,236
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	movdqa	%xmm5,%xmm7
+.byte	102,15,58,15,252,4
+.byte	15,56,203,202
+	paddd	%xmm7,%xmm6
+
+	movdqa	448-128(%rcx),%xmm0
+	paddd	%xmm5,%xmm0
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+.byte	15,56,205,245
+	movdqa	%xmm8,%xmm7
+.byte	15,56,203,202
+
+	movdqa	480-128(%rcx),%xmm0
+	paddd	%xmm6,%xmm0
+	nop
+.byte	15,56,203,209
+	pshufd	$0x0e,%xmm0,%xmm0
+	decq	%rdx
+	nop
+.byte	15,56,203,202
+
+	paddd	%xmm10,%xmm2
+	paddd	%xmm9,%xmm1
+	jnz	.Loop_shaext
+
+	pshufd	$0xb1,%xmm2,%xmm2
+	pshufd	$0x1b,%xmm1,%xmm7
+	pshufd	$0xb1,%xmm1,%xmm1
+	punpckhqdq	%xmm2,%xmm1
+.byte	102,15,58,15,215,8
+
+	movdqu	%xmm1,(%rdi)
+	movdqu	%xmm2,16(%rdi)
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
 .type	sha256_block_data_order_ssse3,@function
 .align	64
 sha256_block_data_order_ssse3:
diff --git a/sources.bp b/sources.bp
index 704f514..2f3e684 100644
--- a/sources.bp
+++ b/sources.bp
@@ -75,6 +75,7 @@
         "src/crypto/cipher_extra/e_aesctrhmac.c",
         "src/crypto/cipher_extra/e_aesgcmsiv.c",
         "src/crypto/cipher_extra/e_chacha20poly1305.c",
+        "src/crypto/cipher_extra/e_des.c",
         "src/crypto/cipher_extra/e_null.c",
         "src/crypto/cipher_extra/e_rc2.c",
         "src/crypto/cipher_extra/e_rc4.c",
@@ -93,6 +94,7 @@
         "src/crypto/crypto.c",
         "src/crypto/curve25519/curve25519.c",
         "src/crypto/curve25519/spake25519.c",
+        "src/crypto/des/des.c",
         "src/crypto/dh_extra/dh_asn1.c",
         "src/crypto/dh_extra/params.c",
         "src/crypto/digest_extra/digest_extra.c",
diff --git a/sources.mk b/sources.mk
index ebc49c7..397432e 100644
--- a/sources.mk
+++ b/sources.mk
@@ -73,6 +73,7 @@
   src/crypto/cipher_extra/e_aesctrhmac.c\
   src/crypto/cipher_extra/e_aesgcmsiv.c\
   src/crypto/cipher_extra/e_chacha20poly1305.c\
+  src/crypto/cipher_extra/e_des.c\
   src/crypto/cipher_extra/e_null.c\
   src/crypto/cipher_extra/e_rc2.c\
   src/crypto/cipher_extra/e_rc4.c\
@@ -91,6 +92,7 @@
   src/crypto/crypto.c\
   src/crypto/curve25519/curve25519.c\
   src/crypto/curve25519/spake25519.c\
+  src/crypto/des/des.c\
   src/crypto/dh_extra/dh_asn1.c\
   src/crypto/dh_extra/params.c\
   src/crypto/digest_extra/digest_extra.c\
diff --git a/src/.gitignore b/src/.gitignore
index a8e3184..6cbc9d2 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -24,7 +24,7 @@
 util/bot/perl-win32
 util/bot/perl-win32.zip
 util/bot/sde-linux64
-util/bot/sde-linux64.tar.bz2
+util/bot/sde-linux64.tar.xz
 util/bot/sde-win32
-util/bot/sde-win32.tar.bz2
+util/bot/sde-win32.tar.xz
 util/bot/win_toolchain.json
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f74e233..35ff4c1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -257,10 +257,11 @@
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11")
 endif()
 
-# pthread_rwlock_t on Linux requires a feature flag. However, it should not be
-# set on Apple platforms, where it instead disables APIs we use. See compat(5)
-# and sys/cdefs.h.
-if(NOT WIN32 AND NOT APPLE)
+# pthread_rwlock_t on Linux requires a feature flag. We limit this to Linux
+# because, on Apple platforms, it instead disables APIs we use. See compat(5)
+# and sys/cdefs.h. Reportedly, FreeBSD also breaks when this is set. See
+# https://crbug.com/boringssl/471.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_XOPEN_SOURCE=700")
 endif()
 
@@ -592,7 +593,7 @@
 add_subdirectory(ssl)
 add_subdirectory(ssl/test)
 add_subdirectory(tool)
-add_subdirectory(util/fipstools/cavp)
+add_subdirectory(util/fipstools)
 add_subdirectory(util/fipstools/acvp/modulewrapper)
 add_subdirectory(decrepit)
 
@@ -617,7 +618,7 @@
   endif()
 endif()
 
-if(UNIX AND NOT APPLE AND NOT ANDROID)
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
   set(HANDSHAKER_ARGS "-handshaker-path" $<TARGET_FILE:handshaker>)
 endif()
 
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index d9cfa5c..6ab74b8 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -256,6 +256,7 @@
   cipher_extra/e_aesctrhmac.c
   cipher_extra/e_aesgcmsiv.c
   cipher_extra/e_chacha20poly1305.c
+  cipher_extra/e_des.c
   cipher_extra/e_null.c
   cipher_extra/e_rc2.c
   cipher_extra/e_rc4.c
@@ -274,6 +275,7 @@
   crypto.c
   curve25519/curve25519.c
   curve25519/spake25519.c
+  des/des.c
   dh_extra/params.c
   dh_extra/dh_asn1.c
   digest_extra/digest_extra.c
diff --git a/src/crypto/cipher_extra/e_aesgcmsiv.c b/src/crypto/cipher_extra/e_aesgcmsiv.c
index 9e77375..387eaff 100644
--- a/src/crypto/cipher_extra/e_aesgcmsiv.c
+++ b/src/crypto/cipher_extra/e_aesgcmsiv.c
@@ -857,22 +857,15 @@
 
 #if defined(AES_GCM_SIV_ASM)
 
-static char avx_aesni_capable(void) {
-  const uint32_t ecx = OPENSSL_ia32cap_P[1];
-
-  return (ecx & (1 << (57 - 32))) != 0 /* AESNI */ &&
-         (ecx & (1 << 28)) != 0 /* AVX */;
-}
-
 const EVP_AEAD *EVP_aead_aes_128_gcm_siv(void) {
-  if (avx_aesni_capable()) {
+  if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
     return &aead_aes_128_gcm_siv_asm;
   }
   return &aead_aes_128_gcm_siv;
 }
 
 const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) {
-  if (avx_aesni_capable()) {
+  if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) {
     return &aead_aes_256_gcm_siv_asm;
   }
   return &aead_aes_256_gcm_siv;
diff --git a/src/crypto/fipsmodule/cipher/e_des.c b/src/crypto/cipher_extra/e_des.c
similarity index 64%
rename from src/crypto/fipsmodule/cipher/e_des.c
rename to src/crypto/cipher_extra/e_des.c
index e77363b..087029b 100644
--- a/src/crypto/fipsmodule/cipher/e_des.c
+++ b/src/crypto/cipher_extra/e_des.c
@@ -59,7 +59,6 @@
 #include <openssl/nid.h>
 
 #include "internal.h"
-#include "../delocate.h"
 
 
 typedef struct {
@@ -88,17 +87,21 @@
   return 1;
 }
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_cbc) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_cbc;
-  out->block_size = 8;
-  out->key_len = 8;
-  out->iv_len = 8;
-  out->ctx_size = sizeof(EVP_DES_KEY);
-  out->flags = EVP_CIPH_CBC_MODE;
-  out->init = des_init_key;
-  out->cipher = des_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_cbc = {
+    /* nid = */ NID_des_cbc,
+    /* block_size = */ 8,
+    /* key_len = */ 8,
+    /* iv_len = */ 8,
+    /* ctx_size = */ sizeof(EVP_DES_KEY),
+    /* flags = */ EVP_CIPH_CBC_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_init_key,
+    /* cipher = */ des_cbc_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_cbc(void) { return &evp_des_cbc; }
 
 static int des_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in,
                           size_t in_len) {
@@ -107,25 +110,29 @@
   }
   in_len -= ctx->cipher->block_size;
 
-  EVP_DES_KEY *dat = (EVP_DES_KEY *) ctx->cipher_data;
+  EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data;
   for (size_t i = 0; i <= in_len; i += ctx->cipher->block_size) {
-    DES_ecb_encrypt((DES_cblock *) (in + i), (DES_cblock *) (out + i),
+    DES_ecb_encrypt((DES_cblock *)(in + i), (DES_cblock *)(out + i),
                     &dat->ks.ks, ctx->encrypt);
   }
   return 1;
 }
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ecb) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_ecb;
-  out->block_size = 8;
-  out->key_len = 8;
-  out->iv_len = 0;
-  out->ctx_size = sizeof(EVP_DES_KEY);
-  out->flags = EVP_CIPH_ECB_MODE;
-  out->init = des_init_key;
-  out->cipher = des_ecb_cipher;
-}
+static const EVP_CIPHER evp_des_ecb = {
+    /* nid = */ NID_des_ecb,
+    /* block_size = */ 8,
+    /* key_len = */ 8,
+    /* iv_len = */ 0,
+    /* ctx_size = */ sizeof(EVP_DES_KEY),
+    /* flags = */ EVP_CIPH_ECB_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_init_key,
+    /* cipher = */ des_ecb_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ecb(void) { return &evp_des_ecb; }
 
 typedef struct {
   union {
@@ -137,7 +144,7 @@
 static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
                              const uint8_t *iv, int enc) {
   DES_cblock *deskey = (DES_cblock *)key;
-  DES_EDE_KEY *dat = (DES_EDE_KEY*) ctx->cipher_data;
+  DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
 
   DES_set_key(&deskey[0], &dat->ks.ks[0]);
   DES_set_key(&deskey[1], &dat->ks.ks[1]);
@@ -147,8 +154,8 @@
 }
 
 static int des_ede3_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
-                              const uint8_t *in, size_t in_len) {
-  DES_EDE_KEY *dat = (DES_EDE_KEY*) ctx->cipher_data;
+                               const uint8_t *in, size_t in_len) {
+  DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
 
   DES_ede3_cbc_encrypt(in, out, in_len, &dat->ks.ks[0], &dat->ks.ks[1],
                        &dat->ks.ks[2], (DES_cblock *)ctx->iv, ctx->encrypt);
@@ -156,22 +163,26 @@
   return 1;
 }
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede3_cbc) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_ede3_cbc;
-  out->block_size = 8;
-  out->key_len = 24;
-  out->iv_len = 8;
-  out->ctx_size = sizeof(DES_EDE_KEY);
-  out->flags = EVP_CIPH_CBC_MODE;
-  out->init = des_ede3_init_key;
-  out->cipher = des_ede3_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_ede3_cbc = {
+    /* nid = */ NID_des_ede3_cbc,
+    /* block_size = */ 8,
+    /* key_len = */ 24,
+    /* iv_len = */ 8,
+    /* ctx_size = */ sizeof(DES_EDE_KEY),
+    /* flags = */ EVP_CIPH_CBC_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_ede3_init_key,
+    /* cipher = */ des_ede3_cbc_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede3_cbc(void) { return &evp_des_ede3_cbc; }
 
 static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key,
-                             const uint8_t *iv, int enc) {
-  DES_cblock *deskey = (DES_cblock *) key;
-  DES_EDE_KEY *dat = (DES_EDE_KEY *) ctx->cipher_data;
+                            const uint8_t *iv, int enc) {
+  DES_cblock *deskey = (DES_cblock *)key;
+  DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data;
 
   DES_set_key(&deskey[0], &dat->ks.ks[0]);
   DES_set_key(&deskey[1], &dat->ks.ks[1]);
@@ -180,17 +191,21 @@
   return 1;
 }
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede_cbc) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_ede_cbc;
-  out->block_size = 8;
-  out->key_len = 16;
-  out->iv_len = 8;
-  out->ctx_size = sizeof(DES_EDE_KEY);
-  out->flags = EVP_CIPH_CBC_MODE;
-  out->init = des_ede_init_key;
-  out->cipher = des_ede3_cbc_cipher;
-}
+static const EVP_CIPHER evp_des_ede_cbc = {
+    /* nid = */ NID_des_ede_cbc,
+    /* block_size = */ 8,
+    /* key_len = */ 16,
+    /* iv_len = */ 8,
+    /* ctx_size = */ sizeof(DES_EDE_KEY),
+    /* flags = */ EVP_CIPH_CBC_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_ede_init_key,
+    /* cipher = */ des_ede3_cbc_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede_cbc(void) { return &evp_des_ede_cbc; }
 
 static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out,
                               const uint8_t *in, size_t in_len) {
@@ -208,30 +223,36 @@
   return 1;
 }
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_ede_ecb;
-  out->block_size = 8;
-  out->key_len = 16;
-  out->iv_len = 0;
-  out->ctx_size = sizeof(DES_EDE_KEY);
-  out->flags = EVP_CIPH_ECB_MODE;
-  out->init = des_ede_init_key;
-  out->cipher = des_ede_ecb_cipher;
-}
+static const EVP_CIPHER evp_des_ede = {
+    /* nid = */ NID_des_ede_ecb,
+    /* block_size = */ 8,
+    /* key_len = */ 16,
+    /* iv_len = */ 0,
+    /* ctx_size = */ sizeof(DES_EDE_KEY),
+    /* flags = */ EVP_CIPH_ECB_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_ede_init_key,
+    /* cipher = */ des_ede_ecb_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
 
-DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_des_ede3) {
-  memset(out, 0, sizeof(EVP_CIPHER));
-  out->nid = NID_des_ede3_ecb;
-  out->block_size = 8;
-  out->key_len = 24;
-  out->iv_len = 0;
-  out->ctx_size = sizeof(DES_EDE_KEY);
-  out->flags = EVP_CIPH_ECB_MODE;
-  out->init = des_ede3_init_key;
-  out->cipher = des_ede_ecb_cipher;
-}
+const EVP_CIPHER *EVP_des_ede(void) { return &evp_des_ede; }
 
-const EVP_CIPHER* EVP_des_ede3_ecb(void) {
-  return EVP_des_ede3();
-}
+static const EVP_CIPHER evp_des_ede3 = {
+    /* nid = */ NID_des_ede3_ecb,
+    /* block_size = */ 8,
+    /* key_len = */ 24,
+    /* iv_len = */ 0,
+    /* ctx_size = */ sizeof(DES_EDE_KEY),
+    /* flags = */ EVP_CIPH_ECB_MODE,
+    /* app_data = */ NULL,
+    /* init = */ des_ede3_init_key,
+    /* cipher = */ des_ede_ecb_cipher,
+    /* cleanup = */ NULL,
+    /* ctrl = */ NULL,
+};
+
+const EVP_CIPHER *EVP_des_ede3(void) { return &evp_des_ede3; }
+
+const EVP_CIPHER *EVP_des_ede3_ecb(void) { return EVP_des_ede3(); }
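The DES ciphers above are now plain static EVP_CIPHER tables rather than lazily-initialized method functions, but callers are unchanged: they still go through the generic EVP interface. A minimal sketch for 3DES-CBC encryption (final-block handling elided; des3_cbc_encrypt is an illustrative name):

    #include <openssl/cipher.h>

    static int des3_cbc_encrypt(uint8_t *out, int *out_len, const uint8_t *in,
                                int in_len, const uint8_t key[24],
                                const uint8_t iv[8]) {
      EVP_CIPHER_CTX ctx;
      EVP_CIPHER_CTX_init(&ctx);
      int ok = EVP_EncryptInit_ex(&ctx, EVP_des_ede3_cbc(), NULL, key, iv) &&
               EVP_EncryptUpdate(&ctx, out, out_len, in, in_len);
      EVP_CIPHER_CTX_cleanup(&ctx);
      return ok;
    }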
diff --git a/src/crypto/cipher_extra/internal.h b/src/crypto/cipher_extra/internal.h
index 0f5f566..4e8fa46 100644
--- a/src/crypto/cipher_extra/internal.h
+++ b/src/crypto/cipher_extra/internal.h
@@ -171,8 +171,7 @@
                       "wrong chacha20_poly1305_seal_data size");
 
 OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) {
-  const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0;
-  return sse41_capable;
+  return CRYPTO_is_SSE4_1_capable();
 }
 
 // chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts
diff --git a/src/crypto/curve25519/curve25519.c b/src/crypto/curve25519/curve25519.c
index 64aa1e6..7cb0add 100644
--- a/src/crypto/curve25519/curve25519.c
+++ b/src/crypto/curve25519/curve25519.c
@@ -502,27 +502,21 @@
 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
   fe u;
   fe_loose v;
-  fe v3;
+  fe w;
   fe vxx;
   fe_loose check;
 
   fe_frombytes(&h->Y, s);
   fe_1(&h->Z);
-  fe_sq_tt(&v3, &h->Y);
-  fe_mul_ttt(&vxx, &v3, &d);
-  fe_sub(&v, &v3, &h->Z);  // u = y^2-1
+  fe_sq_tt(&w, &h->Y);
+  fe_mul_ttt(&vxx, &w, &d);
+  fe_sub(&v, &w, &h->Z);  // u = y^2-1
   fe_carry(&u, &v);
   fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
 
-  fe_sq_tl(&v3, &v);
-  fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
-  fe_sq_tt(&h->X, &v3);
-  fe_mul_ttl(&h->X, &h->X, &v);
-  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7
-
-  fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
-  fe_mul_ttt(&h->X, &h->X, &v3);
-  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)
+  fe_mul_ttl(&w, &u, &v);  // w = u*v
+  fe_pow22523(&h->X, &w);  // x = w^((q-5)/8)
+  fe_mul_ttt(&h->X, &h->X, &u);  // x = u*w^((q-5)/8)
 
   fe_sq_tt(&vxx, &h->X);
   fe_mul_ttl(&vxx, &vxx, &v);
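For reference, a sketch (own derivation, not from the BoringSSL source) of why the shorter computation is equivalent in effect. With $q = 2^{255} - 19$, fe_pow22523 raises to $(q-5)/8 = 2^{252} - 3$, so the removed and added code compute

$$x_{\text{old}} = u\,v^3\,(u v^7)^{(q-5)/8}, \qquad x_{\text{new}} = u\,(u v)^{(q-5)/8},$$

$$\frac{x_{\text{old}}}{x_{\text{new}}} = v^{\,3 + 6(q-5)/8} = v^{\,3(q-1)/4}, \qquad \left(v^{\,3(q-1)/4}\right)^{4} = \left(v^{\,q-1}\right)^{3} = 1.$$

The two candidates therefore differ only by a fourth root of unity, which the vxx check that follows already tolerates: it accepts a candidate up to a factor of $\sqrt{-1}$ and recovers the sign from the encoded bit.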
diff --git a/src/crypto/fipsmodule/des/des.c b/src/crypto/des/des.c
similarity index 100%
rename from src/crypto/fipsmodule/des/des.c
rename to src/crypto/des/des.c
diff --git a/src/crypto/fipsmodule/des/internal.h b/src/crypto/des/internal.h
similarity index 99%
rename from src/crypto/fipsmodule/des/internal.h
rename to src/crypto/des/internal.h
index 3e3992e..2124fd5 100644
--- a/src/crypto/fipsmodule/des/internal.h
+++ b/src/crypto/des/internal.h
@@ -59,7 +59,7 @@
 
 #include <openssl/base.h>
 
-#include "../../internal.h"
+#include "../internal.h"
 
 #if defined(__cplusplus)
 extern "C" {
diff --git a/src/crypto/err/ssl.errordata b/src/crypto/err/ssl.errordata
index 6879134..4205402 100644
--- a/src/crypto/err/ssl.errordata
+++ b/src/crypto/err/ssl.errordata
@@ -90,6 +90,7 @@
 SSL,318,INVALID_ECH_CONFIG_LIST
 SSL,317,INVALID_ECH_PUBLIC_NAME
 SSL,159,INVALID_MESSAGE
+SSL,320,INVALID_OUTER_EXTENSION
 SSL,251,INVALID_OUTER_RECORD_TYPE
 SSL,269,INVALID_SCT_LIST
 SSL,295,INVALID_SIGNATURE_ALGORITHM
@@ -133,7 +134,6 @@
 SSL,187,OLD_SESSION_CIPHER_NOT_RETURNED
 SSL,268,OLD_SESSION_PRF_HASH_MISMATCH
 SSL,188,OLD_SESSION_VERSION_NOT_RETURNED
-SSL,320,OUTER_EXTENSION_NOT_FOUND
 SSL,189,OUTPUT_ALIASES_INPUT
 SSL,190,PARSE_TLSEXT
 SSL,191,PATH_TOO_LONG
diff --git a/src/crypto/fipsmodule/FIPS.md b/src/crypto/fipsmodule/FIPS.md
index d3b3890..bc5708f 100644
--- a/src/crypto/fipsmodule/FIPS.md
+++ b/src/crypto/fipsmodule/FIPS.md
@@ -12,30 +12,21 @@
 1. 2018-07-30: certificate [#3318](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3318), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20180730.docx) (in docx format).
 1. 2019-08-08: certificate [#3678](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3678), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Security-Policy-20190808.docx) (in docx format).
 1. 2019-10-20: certificate [#3753](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/3753), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20191020.docx) (in docx format).
+1. 2021-01-28: certificate [#4156](https://csrc.nist.gov/Projects/Cryptographic-Module-Validation-Program/Certificate/4156), [security policy](/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx) (in docx format).
 
-## Running CAVP tests
+## Running ACVP tests
 
-CAVP results are calculated by `util/fipstools/cavp`, but that binary is almost always run by `util/fipstools/run_cavp.go`. The latter knows the set of tests to be processed and the flags needed to configure `cavp` for each one. It must be run from the top of a CAVP directory and needs the following options:
+See `util/fipstools/acvp/ACVP.md` for details of how ACVP testing is done.
 
-1. `-oracle-bin`: points to the location of `util/fipstools/cavp`
-2. `-no-fax`: this is needed to suppress checking of the FAX files, which are only included in sample sets.
+## Breaking known-answer and continuous tests
 
-## Breaking power-on and continuous tests
+Each known-answer test (KAT) uses a unique, random input value. `util/fipstools/break-kat.go` contains a listing of those values and can be used to corrupt a given test in a binary. Since changes to the KAT input values will invalidate the integrity test, `BORINGSSL_FIPS_BREAK_TESTS` can be defined in `fips_break_tests.h` to disable it for the purposes of testing.
 
-In order to demonstrate failures of the various FIPS 140 tests, BoringSSL can be built in ways that will trigger such failures. This is controlled by passing `-DFIPS_BREAK_TEST=`(test to break) to CMake, where the following tests can be specified:
+Some FIPS tests cannot be broken by replacing a known string in the binary. For those, when `BORINGSSL_FIPS_BREAK_TESTS` is defined, the environment variable `BORINGSSL_FIPS_BREAK_TEST` can be set to one of a number of values in order to break the corresponding test:
 
-1. AES\_CBC
-1. AES\_GCM
-1. DES
-1. SHA\_1
-1. SHA\_256
-1. SHA\_512
-1. RSA\_SIG
-1. ECDSA\_SIG
-1. DRBG
-1. RSA\_PWCT
-1. ECDSA\_PWCT
-1. TLS\_KDF
+1. `RSA_PWCT`
+1. `ECDSA_PWCT`
+1. `CRNG`
 
 ## Breaking the integrity test
 
@@ -61,12 +52,6 @@
 
 FIPS requires that RNG state be zeroed when the process exits. In order to implement this, all per-thread RNG states are tracked in a linked list and a destructor function is included which clears them. In order for this to be safe in the presence of threads, a lock is used to stop all other threads from using the RNG once this process has begun. Thus the main thread exiting may cause other threads to deadlock, and drawing on entropy in a destructor function may also deadlock.
 
-## Self-test optimisation
-
-On Android, the self-tests are optimised in line with [IG](https://csrc.nist.gov/csrc/media/projects/cryptographic-module-validation-program/documents/fips140-2/fips1402ig.pdf) section 9.11. The module will always perform the integrity test at power-on, but the self-tests will test for the presence of a file named after the hex encoded, HMAC-SHA-256 hash of the module in `/dev/boringssl/selftest/`. If such a file is found then the self-tests are skipped. Otherwise, after the self-tests complete successfully, that file will be written. Any I/O errors are ignored and, if they occur when testing for the presence of the file, the module acts as if it's not present.
-
-It is intended that a `tmpfs` be mounted at that location in order to skip running the self tests for every process once they have already passed in a given instance of the operating system.
-
 ## Integrity Test
 
 FIPS-140 mandates that a module calculate an HMAC of its own code in a constructor function and compare the result to a known-good value. Typical code produced by a C compiler includes large numbers of relocations: places in the machine code where the linker needs to resolve and inject the final value of a symbolic expression. These relocations mean that the bytes that make up any specific bit of code generally aren't known until the final link has completed.
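+
+In outline, the check looks like the following sketch (simplified: the real module also covers a read-only data section, and uses HMAC-SHA-512 except on AArch64 and Android, where it uses HMAC-SHA-256):
+
+```c
+#include <openssl/digest.h>
+#include <openssl/hmac.h>
+#include <openssl/sha.h>
+#include <string.h>
+
+// Linker-provided symbols delimiting the module's code, and the known-good
+// value injected after the final link.
+extern const uint8_t BORINGSSL_bcm_text_start[], BORINGSSL_bcm_text_end[];
+extern const uint8_t BORINGSSL_bcm_text_hash[SHA256_DIGEST_LENGTH];
+
+static int integrity_test_ok(void) {
+  static const uint8_t kHMACKey[64] = {0};
+  uint8_t calculated[EVP_MAX_MD_SIZE];
+  unsigned calculated_len;
+  HMAC(EVP_sha256(), kHMACKey, sizeof(kHMACKey), BORINGSSL_bcm_text_start,
+       BORINGSSL_bcm_text_end - BORINGSSL_bcm_text_start, calculated,
+       &calculated_len);
+  return calculated_len == SHA256_DIGEST_LENGTH &&
+         memcmp(calculated, BORINGSSL_bcm_text_hash, calculated_len) == 0;
+}
+```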
diff --git a/src/crypto/fipsmodule/aes/internal.h b/src/crypto/fipsmodule/aes/internal.h
index 9f7dd47..0685bc4 100644
--- a/src/crypto/fipsmodule/aes/internal.h
+++ b/src/crypto/fipsmodule/aes/internal.h
@@ -30,18 +30,14 @@
 #define HWAES
 #define HWAES_ECB
 
-OPENSSL_INLINE int hwaes_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (57 - 32))) != 0;
-}
+OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_AESNI_capable(); }
 
 #define VPAES
 #if defined(OPENSSL_X86_64)
 #define VPAES_CTR32
 #endif
 #define VPAES_CBC
-OPENSSL_INLINE int vpaes_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
+OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_SSSE3_capable(); }
 
 #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 #define HWAES
diff --git a/src/crypto/fipsmodule/bcm.c b/src/crypto/fipsmodule/bcm.c
index 639235e..1219bc7 100644
--- a/src/crypto/fipsmodule/bcm.c
+++ b/src/crypto/fipsmodule/bcm.c
@@ -58,8 +58,6 @@
 #include "cipher/aead.c"
 #include "cipher/cipher.c"
 #include "cipher/e_aes.c"
-#include "cipher/e_des.c"
-#include "des/des.c"
 #include "dh/check.c"
 #include "dh/dh.c"
 #include "digest/digest.c"
@@ -192,16 +190,23 @@
 #endif
 
   assert_within(rodata_start, kPrimes, rodata_end);
-  assert_within(rodata_start, des_skb, rodata_end);
   assert_within(rodata_start, kP256Params, rodata_end);
   assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end);
 
 #if defined(OPENSSL_AARCH64) || defined(OPENSSL_ANDROID)
   uint8_t result[SHA256_DIGEST_LENGTH];
   const EVP_MD *const kHashFunction = EVP_sha256();
+  if (!boringssl_self_test_sha256() ||
+      !boringssl_self_test_hmac_sha256()) {
+    goto err;
+  }
 #else
   uint8_t result[SHA512_DIGEST_LENGTH];
   const EVP_MD *const kHashFunction = EVP_sha512();
+  if (!boringssl_self_test_sha512() ||
+      !boringssl_self_test_hmac_sha256()) {
+    goto err;
+  }
 #endif
 
   static const uint8_t kHMACKey[64] = {0};
@@ -238,20 +243,18 @@
   const uint8_t *expected = BORINGSSL_bcm_text_hash;
 
   if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) {
+#if !defined(BORINGSSL_FIPS_BREAK_TESTS)
     goto err;
+#endif
   }
 
   OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10.
-
-  if (!boringssl_fips_self_test(BORINGSSL_bcm_text_hash, sizeof(result))) {
-    goto err;
-  }
-#else
-  if (!BORINGSSL_self_test()) {
-    goto err;
-  }
 #endif  // OPENSSL_ASAN
 
+  if (!boringssl_self_test_startup()) {
+    goto err;
+  }
+
   return;
 
 err:
diff --git a/src/crypto/fipsmodule/bn/rsaz_exp.h b/src/crypto/fipsmodule/bn/rsaz_exp.h
index 2f0c2c0..104bb7a 100644
--- a/src/crypto/fipsmodule/bn/rsaz_exp.h
+++ b/src/crypto/fipsmodule/bn/rsaz_exp.h
@@ -41,18 +41,17 @@
                             BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]);
 
 OPENSSL_INLINE int rsaz_avx2_capable(void) {
-  const uint32_t *cap = OPENSSL_ia32cap_get();
-  return (cap[2] & (1 << 5)) != 0;  // AVX2
+  return CRYPTO_is_AVX2_capable();
 }
 
 OPENSSL_INLINE int rsaz_avx2_preferred(void) {
-  const uint32_t *cap = OPENSSL_ia32cap_get();
-  static const uint32_t kBMI2AndADX = (1 << 8) | (1 << 19);
-  if ((cap[2] & kBMI2AndADX) == kBMI2AndADX) {
-    // If BMI2 and ADX are available, x86_64-mont5.pl is faster.
+  if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() &&
+      CRYPTO_is_ADX_capable()) {
+    // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the
+    // .Lmulx4x_enter and .Lpowerx5_enter branches.
     return 0;
   }
-  return (cap[2] & (1 << 5)) != 0;  // AVX2
+  return CRYPTO_is_AVX2_capable();
 }
 
 
diff --git a/src/crypto/fipsmodule/dh/dh.c b/src/crypto/fipsmodule/dh/dh.c
index ab596e9..b59afc6 100644
--- a/src/crypto/fipsmodule/dh/dh.c
+++ b/src/crypto/fipsmodule/dh/dh.c
@@ -64,6 +64,7 @@
 #include <openssl/mem.h>
 #include <openssl/thread.h>
 
+#include "internal.h"
 #include "../../internal.h"
 #include "../bn/internal.h"
 
@@ -186,6 +187,8 @@
 }
 
 int DH_generate_key(DH *dh) {
+  boringssl_ensure_ffdh_self_test();
+
   int ok = 0;
   int generate_new_key = 0;
   BN_CTX *ctx = NULL;
@@ -322,7 +325,8 @@
   return ret;
 }
 
-int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+int dh_compute_key_padded_no_self_test(unsigned char *out,
+                                       const BIGNUM *peers_key, DH *dh) {
   BN_CTX *ctx = BN_CTX_new();
   if (ctx == NULL) {
     return -1;
@@ -343,7 +347,15 @@
   return ret;
 }
 
+int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+  boringssl_ensure_ffdh_self_test();
+
+  return dh_compute_key_padded_no_self_test(out, peers_key, dh);
+}
+
 int DH_compute_key(unsigned char *out, const BIGNUM *peers_key, DH *dh) {
+  boringssl_ensure_ffdh_self_test();
+
   BN_CTX *ctx = BN_CTX_new();
   if (ctx == NULL) {
     return -1;
diff --git a/src/crypto/fipsmodule/dh/internal.h b/src/crypto/fipsmodule/dh/internal.h
new file mode 100644
index 0000000..c40172d
--- /dev/null
+++ b/src/crypto/fipsmodule/dh/internal.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
+#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+// dh_compute_key_padded_no_self_test does the same as |DH_compute_key_padded|,
+// but doesn't try to run the self-test first. This is for use in the self tests
+// themselves, to prevent an infinite loop.
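+//
+// For example, the FFDH known-answer test in self_check.c computes its shared
+// secret with this function; calling |DH_compute_key_padded| there would
+// re-enter |boringssl_ensure_ffdh_self_test|.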
+int dh_compute_key_padded_no_self_test(unsigned char *out,
+                                       const BIGNUM *peers_key, DH *dh);
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H
diff --git a/src/crypto/fipsmodule/ec/ec.c b/src/crypto/fipsmodule/ec/ec.c
index 1f03e15..93fdcfc 100644
--- a/src/crypto/fipsmodule/ec/ec.c
+++ b/src/crypto/fipsmodule/ec/ec.c
@@ -943,8 +943,9 @@
   return ok;
 }
 
-int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
-                 const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) {
+int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r,
+                              const BIGNUM *g_scalar, const EC_POINT *p,
+                              const BIGNUM *p_scalar, BN_CTX *ctx) {
   // Previously, this function set |r| to the point at infinity if there was
   // nothing to multiply. But, nobody should be calling this function with
   // nothing to multiply in the first place.
@@ -1010,6 +1011,13 @@
   return ret;
 }
 
+int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar,
+                 const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) {
+  boringssl_ensure_ecc_self_test();
+
+  return ec_point_mul_no_self_test(group, r, g_scalar, p, p_scalar, ctx);
+}
+
 int ec_point_mul_scalar_public(const EC_GROUP *group, EC_RAW_POINT *r,
                                const EC_SCALAR *g_scalar, const EC_RAW_POINT *p,
                                const EC_SCALAR *p_scalar) {
diff --git a/src/crypto/fipsmodule/ec/ec_key.c b/src/crypto/fipsmodule/ec/ec_key.c
index 7a6daab..d7acf96 100644
--- a/src/crypto/fipsmodule/ec/ec_key.c
+++ b/src/crypto/fipsmodule/ec/ec_key.c
@@ -339,9 +339,9 @@
   if (key->priv_key) {
     uint8_t data[16] = {0};
     ECDSA_SIG *sig = ECDSA_do_sign(data, sizeof(data), key);
-#if defined(BORINGSSL_FIPS_BREAK_ECDSA_PWCT)
-    data[0] = ~data[0];
-#endif
+    if (boringssl_fips_break_test("ECDSA_PWCT")) {
+      data[0] = ~data[0];
+    }
     int ok = sig != NULL &&
              ECDSA_do_verify(data, sizeof(data), sig, key);
     ECDSA_SIG_free(sig);
@@ -439,6 +439,8 @@
 }
 
 int EC_KEY_generate_key_fips(EC_KEY *eckey) {
+  boringssl_ensure_ecc_self_test();
+
   if (EC_KEY_generate_key(eckey) && EC_KEY_check_fips(eckey)) {
     return 1;
   }
diff --git a/src/crypto/fipsmodule/ec/internal.h b/src/crypto/fipsmodule/ec/internal.h
index 289c3aa..488adb8 100644
--- a/src/crypto/fipsmodule/ec/internal.h
+++ b/src/crypto/fipsmodule/ec/internal.h
@@ -301,6 +301,13 @@
 int ec_point_set_affine_coordinates(const EC_GROUP *group, EC_AFFINE *out,
                                     const EC_FELEM *x, const EC_FELEM *y);
 
+// ec_point_mul_no_self_test does the same as |EC_POINT_mul|, but doesn't try to
+// run the self-test first. This is for use in the self tests themselves, to
+// prevent an infinite loop.
+int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r,
+                              const BIGNUM *g_scalar, const EC_POINT *p,
+                              const BIGNUM *p_scalar, BN_CTX *ctx);
+
 // ec_point_mul_scalar sets |r| to |p| * |scalar|. Both inputs are considered
 // secret.
 int ec_point_mul_scalar(const EC_GROUP *group, EC_RAW_POINT *r,
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64.c b/src/crypto/fipsmodule/ec/p256-x86_64.c
index 99deb36..506b7d2 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64.c
+++ b/src/crypto/fipsmodule/ec/p256-x86_64.c
@@ -554,7 +554,7 @@
 static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group,
                                                  EC_SCALAR *out,
                                                  const EC_SCALAR *in) {
-  if ((OPENSSL_ia32cap_get()[1] & (1 << 28)) == 0) {
+  if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; fallback to generic code.
     return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in);
   }
diff --git a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
index a083f3d..f6f070a 100644
--- a/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
+++ b/src/crypto/fipsmodule/ec/p256-x86_64_test.cc
@@ -98,7 +98,7 @@
 }
 
 TEST(P256_X86_64Test, BEEU) {
-  if ((OPENSSL_ia32cap_P[1] & (1 << 28)) == 0) {
+  if (!CRYPTO_is_AVX_capable()) {
     // No AVX support; cannot run the BEEU code.
     return;
   }
diff --git a/src/crypto/fipsmodule/ecdh/ecdh.c b/src/crypto/fipsmodule/ecdh/ecdh.c
index 4e6d0bf..36fbadc 100644
--- a/src/crypto/fipsmodule/ecdh/ecdh.c
+++ b/src/crypto/fipsmodule/ecdh/ecdh.c
@@ -75,10 +75,13 @@
 #include <openssl/sha.h>
 
 #include "../ec/internal.h"
+#include "../../internal.h"
 
 
 int ECDH_compute_key_fips(uint8_t *out, size_t out_len, const EC_POINT *pub_key,
                           const EC_KEY *priv_key) {
+  boringssl_ensure_ecc_self_test();
+
   if (priv_key->priv_key == NULL) {
     OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE);
     return 0;
diff --git a/src/crypto/fipsmodule/ecdsa/ecdsa.c b/src/crypto/fipsmodule/ecdsa/ecdsa.c
index 5d99903..db0c6e5 100644
--- a/src/crypto/fipsmodule/ecdsa/ecdsa.c
+++ b/src/crypto/fipsmodule/ecdsa/ecdsa.c
@@ -151,8 +151,8 @@
   return 1;
 }
 
-int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
-                    const ECDSA_SIG *sig, const EC_KEY *eckey) {
+int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len,
+                                 const ECDSA_SIG *sig, const EC_KEY *eckey) {
   const EC_GROUP *group = EC_KEY_get0_group(eckey);
   const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey);
   if (group == NULL || pub_key == NULL || sig == NULL) {
@@ -198,6 +198,13 @@
   return 1;
 }
 
+int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
+                    const ECDSA_SIG *sig, const EC_KEY *eckey) {
+  boringssl_ensure_ecc_self_test();
+
+  return ecdsa_do_verify_no_self_test(digest, digest_len, sig, eckey);
+}
+
 static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry,
                                   const EC_SCALAR *priv_key, const EC_SCALAR *k,
                                   const uint8_t *digest, size_t digest_len) {
@@ -292,12 +299,16 @@
 ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing(
     const uint8_t *digest, size_t digest_len, const EC_KEY *eckey,
     const uint8_t *nonce, size_t nonce_len) {
+  boringssl_ensure_ecc_self_test();
+
   return ecdsa_sign_with_nonce_for_known_answer_test(digest, digest_len, eckey,
                                                      nonce, nonce_len);
 }
 
 ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
                          const EC_KEY *eckey) {
+  boringssl_ensure_ecc_self_test();
+
   if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) {
     OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED);
     return NULL;
diff --git a/src/crypto/fipsmodule/ecdsa/internal.h b/src/crypto/fipsmodule/ecdsa/internal.h
index 5115dfa..645959f 100644
--- a/src/crypto/fipsmodule/ecdsa/internal.h
+++ b/src/crypto/fipsmodule/ecdsa/internal.h
@@ -31,6 +31,12 @@
                                                        const uint8_t *nonce,
                                                        size_t nonce_len);
 
+// ecdsa_do_verify_no_self_test does the same as |ECDSA_do_verify|, but doesn't
+// try to run the self-test first. This is for use in the self tests themselves,
+// to prevent an infinite loop.
+int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len,
+                                 const ECDSA_SIG *sig, const EC_KEY *eckey);
+
 
 #if defined(__cplusplus)
 }
diff --git a/src/crypto/fipsmodule/modes/gcm.c b/src/crypto/fipsmodule/modes/gcm.c
index 28218b4..5b909aa 100644
--- a/src/crypto/fipsmodule/modes/gcm.c
+++ b/src/crypto/fipsmodule/modes/gcm.c
@@ -152,7 +152,7 @@
 
 #if defined(GHASH_ASM_X86_64)
   if (crypto_gcm_clmul_enabled()) {
-    if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) {  // AVX+MOVBE
+    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
       gcm_init_avx(out_table, H.u);
       *out_mult = gcm_gmult_avx;
       *out_hash = gcm_ghash_avx;
@@ -164,7 +164,7 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     gcm_init_ssse3(out_table, H.u);
     *out_mult = gcm_gmult_ssse3;
     *out_hash = gcm_ghash_ssse3;
@@ -177,7 +177,7 @@
     *out_hash = gcm_ghash_clmul;
     return;
   }
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     gcm_init_ssse3(out_table, H.u);
     *out_mult = gcm_gmult_ssse3;
     *out_hash = gcm_ghash_ssse3;
@@ -722,9 +722,7 @@
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
 int crypto_gcm_clmul_enabled(void) {
 #if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
-  const uint32_t *ia32cap = OPENSSL_ia32cap_get();
-  return (ia32cap[0] & (1 << 24)) &&  // check FXSR bit
-         (ia32cap[1] & (1 << 1));     // check PCLMULQDQ bit
+  return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable();
 #else
   return 0;
 #endif
diff --git a/src/crypto/fipsmodule/modes/gcm_test.cc b/src/crypto/fipsmodule/modes/gcm_test.cc
index 539b764..d66d8ae 100644
--- a/src/crypto/fipsmodule/modes/gcm_test.cc
+++ b/src/crypto/fipsmodule/modes/gcm_test.cc
@@ -136,7 +136,7 @@
 
   alignas(16) u128 Htable[16];
 #if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64)
-  if (gcm_ssse3_capable()) {
+  if (CRYPTO_is_SSSE3_capable()) {
     CHECK_ABI_SEH(gcm_init_ssse3, Htable, kH);
     CHECK_ABI_SEH(gcm_gmult_ssse3, X, Htable);
     for (size_t blocks : kBlockCounts) {
@@ -152,7 +152,7 @@
     }
 
 #if defined(GHASH_ASM_X86_64)
-    if (((OPENSSL_ia32cap_get()[1] >> 22) & 0x41) == 0x41) {  // AVX+MOVBE
+    if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) {
       CHECK_ABI_SEH(gcm_init_avx, Htable, kH);
       CHECK_ABI_SEH(gcm_gmult_avx, X, Htable);
       for (size_t blocks : kBlockCounts) {
diff --git a/src/crypto/fipsmodule/modes/internal.h b/src/crypto/fipsmodule/modes/internal.h
index f022f9b..0164aac 100644
--- a/src/crypto/fipsmodule/modes/internal.h
+++ b/src/crypto/fipsmodule/modes/internal.h
@@ -253,10 +253,6 @@
 void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp,
                      size_t len);
 
-OPENSSL_INLINE char gcm_ssse3_capable(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1 << (41 - 32))) != 0;
-}
-
 // |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be
 // 16-byte-aligned, but |gcm_init_ssse3| does not.
 void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]);
diff --git a/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx b/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx
new file mode 100644
index 0000000..17fcd25
--- /dev/null
+++ b/src/crypto/fipsmodule/policydocs/BoringCrypto-Android-Security-Policy-20210319.docx
Binary files differ
diff --git a/src/crypto/fipsmodule/rand/internal.h b/src/crypto/fipsmodule/rand/internal.h
index bbeef76..eccf047 100644
--- a/src/crypto/fipsmodule/rand/internal.h
+++ b/src/crypto/fipsmodule/rand/internal.h
@@ -143,15 +143,14 @@
 #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
 
 OPENSSL_INLINE int have_rdrand(void) {
-  return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+  return CRYPTO_is_RDRAND_capable();
 }
 
 // have_fast_rdrand returns true if RDRAND is supported and it's reasonably
 // fast. Concretely the latter is defined by whether the chip is Intel (fast) or
 // not (assumed slow).
 OPENSSL_INLINE int have_fast_rdrand(void) {
-  const uint32_t *const ia32cap = OPENSSL_ia32cap_get();
-  return (ia32cap[1] & (1u << 30)) && (ia32cap[0] & (1u << 30));
+  return CRYPTO_is_RDRAND_capable() && CRYPTO_is_intel_cpu();
 }
 
 // CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to
diff --git a/src/crypto/fipsmodule/rand/rand.c b/src/crypto/fipsmodule/rand/rand.c
index 9c54fc5..357be39 100644
--- a/src/crypto/fipsmodule/rand/rand.c
+++ b/src/crypto/fipsmodule/rand/rand.c
@@ -170,11 +170,11 @@
     CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len);
   }
 
-#if defined(BORINGSSL_FIPS_BREAK_CRNG)
-  // This breaks the "continuous random number generator test" defined in FIPS
-  // 140-2, section 4.9.2, and implemented in |rand_get_seed|.
-  OPENSSL_memset(out_entropy, 0, out_entropy_len);
-#endif
+  if (boringssl_fips_break_test("CRNG")) {
+    // This breaks the "continuous random number generator test" defined in FIPS
+    // 140-2, section 4.9.2, and implemented in |rand_get_seed|.
+    OPENSSL_memset(out_entropy, 0, out_entropy_len);
+  }
 }
 
 // In passive entropy mode, entropy is supplied from outside of the module via
diff --git a/src/crypto/fipsmodule/rsa/internal.h b/src/crypto/fipsmodule/rsa/internal.h
index d9d6fac..1cb3b5f 100644
--- a/src/crypto/fipsmodule/rsa/internal.h
+++ b/src/crypto/fipsmodule/rsa/internal.h
@@ -124,6 +124,28 @@
 extern const size_t kBoringSSLRSASqrtTwoLen;
 
 
+// Functions that avoid self-tests.
+//
+// Self-tests need to call functions that don't try to ensure that the
+// self-tests have passed, as that would recurse. These functions, in turn,
+// must limit themselves to such functions too.
+//
+// These functions are the same as their public versions, but skip the self-test
+// check.
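+//
+// For example, |boringssl_self_test_rsa| verifies its known-answer signature
+// with |rsa_verify_no_self_test|; calling |RSA_verify| there would re-enter
+// |boringssl_ensure_rsa_self_test|.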
+
+int rsa_verify_no_self_test(int hash_nid, const uint8_t *digest,
+                            size_t digest_len, const uint8_t *sig,
+                            size_t sig_len, RSA *rsa);
+
+int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+                                size_t max_out, const uint8_t *in,
+                                size_t in_len, int padding);
+
+int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest,
+                          unsigned digest_len, uint8_t *out, unsigned *out_len,
+                          RSA *rsa);
+
+
 #if defined(__cplusplus)
 }  // extern C
 #endif
diff --git a/src/crypto/fipsmodule/rsa/rsa.c b/src/crypto/fipsmodule/rsa/rsa.c
index 3205d7d..733e7fa 100644
--- a/src/crypto/fipsmodule/rsa/rsa.c
+++ b/src/crypto/fipsmodule/rsa/rsa.c
@@ -303,8 +303,9 @@
   return out_len;
 }
 
-int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
-                 const uint8_t *in, size_t in_len, int padding) {
+static int rsa_sign_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+                                     size_t max_out, const uint8_t *in,
+                                     size_t in_len, int padding) {
   if (rsa->meth->sign_raw) {
     return rsa->meth->sign_raw(rsa, out_len, out, max_out, in, in_len, padding);
   }
@@ -312,6 +313,13 @@
   return rsa_default_sign_raw(rsa, out_len, out, max_out, in, in_len, padding);
 }
 
+int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
+                 const uint8_t *in, size_t in_len, int padding) {
+  boringssl_ensure_rsa_self_test();
+  return rsa_sign_raw_no_self_test(rsa, out_len, out, max_out, in, in_len,
+                                   padding);
+}
+
 int RSA_private_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa,
                         int padding) {
   size_t out_len;
@@ -523,8 +531,9 @@
   return 0;
 }
 
-int RSA_sign(int hash_nid, const uint8_t *digest, unsigned digest_len,
-             uint8_t *out, unsigned *out_len, RSA *rsa) {
+int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest,
+                          unsigned digest_len, uint8_t *out, unsigned *out_len,
+                          RSA *rsa) {
   const unsigned rsa_size = RSA_size(rsa);
   int ret = 0;
   uint8_t *signed_msg = NULL;
@@ -539,8 +548,9 @@
   if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
                             &signed_msg_is_alloced, hash_nid, digest,
                             digest_len) ||
-      !RSA_sign_raw(rsa, &size_t_out_len, out, rsa_size, signed_msg,
-                    signed_msg_len, RSA_PKCS1_PADDING)) {
+      !rsa_sign_raw_no_self_test(rsa, &size_t_out_len, out, rsa_size,
+                                 signed_msg, signed_msg_len,
+                                 RSA_PKCS1_PADDING)) {
     goto err;
   }
 
@@ -554,6 +564,13 @@
   return ret;
 }
 
+int RSA_sign(int hash_nid, const uint8_t *digest, unsigned digest_len,
+             uint8_t *out, unsigned *out_len, RSA *rsa) {
+  boringssl_ensure_rsa_self_test();
+
+  return rsa_sign_no_self_test(hash_nid, digest, digest_len, out, out_len, rsa);
+}
+
 int RSA_sign_pss_mgf1(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
                       const uint8_t *digest, size_t digest_len,
                       const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len) {
@@ -577,8 +594,9 @@
   return ret;
 }
 
-int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len,
-               const uint8_t *sig, size_t sig_len, RSA *rsa) {
+int rsa_verify_no_self_test(int hash_nid, const uint8_t *digest,
+                            size_t digest_len, const uint8_t *sig,
+                            size_t sig_len, RSA *rsa) {
   if (rsa->n == NULL || rsa->e == NULL) {
     OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING);
     return 0;
@@ -602,12 +620,9 @@
     return 0;
   }
 
-  if (!RSA_verify_raw(rsa, &len, buf, rsa_size, sig, sig_len,
-                      RSA_PKCS1_PADDING)) {
-    goto out;
-  }
-
-  if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
+  if (!rsa_verify_raw_no_self_test(rsa, &len, buf, rsa_size, sig, sig_len,
+                                   RSA_PKCS1_PADDING) ||
+      !RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len,
                             &signed_msg_is_alloced, hash_nid, digest,
                             digest_len)) {
     goto out;
@@ -630,6 +645,13 @@
   return ret;
 }
 
+int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len,
+               const uint8_t *sig, size_t sig_len, RSA *rsa) {
+  boringssl_ensure_rsa_self_test();
+  return rsa_verify_no_self_test(hash_nid, digest, digest_len, sig, sig_len,
+                                 rsa);
+}
+
 int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len,
                         const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len,
                         const uint8_t *sig, size_t sig_len) {
@@ -905,9 +927,9 @@
     ret = 0;
     goto cleanup;
   }
-#if defined(BORINGSSL_FIPS_BREAK_RSA_PWCT)
-  data[0] = ~data[0];
-#endif
+  if (boringssl_fips_break_test("RSA_PWCT")) {
+    data[0] = ~data[0];
+  }
   if (!RSA_verify(NID_sha256, data, sizeof(data), sig, sig_len, key)) {
     OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR);
     ret = 0;
diff --git a/src/crypto/fipsmodule/rsa/rsa_impl.c b/src/crypto/fipsmodule/rsa/rsa_impl.c
index a6865c0..1046f35 100644
--- a/src/crypto/fipsmodule/rsa/rsa_impl.c
+++ b/src/crypto/fipsmodule/rsa/rsa_impl.c
@@ -261,6 +261,8 @@
 
 int RSA_encrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
                 const uint8_t *in, size_t in_len, int padding) {
+  boringssl_ensure_rsa_self_test();
+
   if (!rsa_check_public_key(rsa)) {
     return 0;
   }
@@ -528,6 +530,8 @@
 
 int rsa_default_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
                         const uint8_t *in, size_t in_len, int padding) {
+  boringssl_ensure_rsa_self_test();
+
   const unsigned rsa_size = RSA_size(rsa);
   uint8_t *buf = NULL;
   int ret = 0;
@@ -593,8 +597,9 @@
 
 static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx);
 
-int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
-                   const uint8_t *in, size_t in_len, int padding) {
+int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out,
+                                size_t max_out, const uint8_t *in,
+                                size_t in_len, int padding) {
   if (!rsa_check_public_key(rsa)) {
     return 0;
   }
@@ -686,6 +691,14 @@
   return ret;
 }
 
+int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out,
+                   const uint8_t *in, size_t in_len, int padding) {
+  boringssl_ensure_rsa_self_test();
+  return rsa_verify_raw_no_self_test(rsa, out_len, out, max_out, in, in_len,
+                                     padding);
+}
+
 int rsa_default_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in,
                                   size_t len) {
   if (rsa->n == NULL || rsa->d == NULL) {
@@ -1324,6 +1337,8 @@
 static int RSA_generate_key_ex_maybe_fips(RSA *rsa, int bits,
                                           const BIGNUM *e_value, BN_GENCB *cb,
                                           int check_fips) {
+  boringssl_ensure_rsa_self_test();
+
   RSA *tmp = NULL;
   uint32_t err;
   int ret = 0;
diff --git a/src/crypto/fipsmodule/self_check/self_check.c b/src/crypto/fipsmodule/self_check/self_check.c
index 94f2da7..b7cd868 100644
--- a/src/crypto/fipsmodule/self_check/self_check.c
+++ b/src/crypto/fipsmodule/self_check/self_check.c
@@ -20,17 +20,18 @@
 #include <openssl/aead.h>
 #include <openssl/aes.h>
 #include <openssl/bn.h>
-#include <openssl/des.h>
 #include <openssl/dh.h>
 #include <openssl/digest.h>
 #include <openssl/ec.h>
 #include <openssl/ecdsa.h>
 #include <openssl/ec_key.h>
+#include <openssl/hmac.h>
 #include <openssl/nid.h>
 #include <openssl/rsa.h>
 #include <openssl/sha.h>
 
 #include "../../internal.h"
+#include "../dh/internal.h"
 #include "../ec/internal.h"
 #include "../ecdsa/internal.h"
 #include "../rand/internal.h"
@@ -47,21 +48,6 @@
 
 #else
 
-#if defined(BORINGSSL_FIPS) && defined(OPENSSL_ANDROID)
-// FIPS builds on Android will test for flag files, named after the module hash,
-// in /dev/boringssl/selftest/. If such a flag file exists, it's assumed that
-// self-tests have already passed and thus do not need to be repeated. (The
-// integrity tests always run, however.)
-//
-// If self-tests complete successfully and the environment variable named in
-// |kFlagWriteEnableEnvVar| is present, then the flag file will be created. The
-// flag file isn't written without the environment variable being set in order
-// to avoid SELinux violations on Android.
-#define BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
-static const char kFlagPrefix[] = "/dev/boringssl/selftest/";
-static const char kFlagWriteEnableEnvVar[] = "BORINGSSL_SELF_TEST_CREATE_FLAG";
-#endif
-
 static void hexdump(const uint8_t *in, size_t len) {
   for (size_t i = 0; i < len; i++) {
     fprintf(stderr, "%02x", in[i]);
@@ -71,7 +57,7 @@
 static int check_test(const void *expected, const void *actual,
                       size_t expected_len, const char *name) {
   if (OPENSSL_memcmp(actual, expected, expected_len) != 0) {
-    fprintf(stderr, "%s failed.\nExpected: ", name);
+    fprintf(stderr, "%s failed.\nExpected:   ", name);
     hexdump(expected, expected_len);
     fprintf(stderr, "\nCalculated: ");
     hexdump(actual, expected_len);
@@ -87,6 +73,28 @@
   return *out != NULL;
 }
 
+static int serialize_ecdsa_sig(uint8_t *out, size_t out_len,
+                               const ECDSA_SIG *sig) {
+  if ((out_len & 1) ||  //
+      !BN_bn2bin_padded(out, out_len / 2, sig->r) ||
+      !BN_bn2bin_padded(out + out_len / 2, out_len / 2, sig->s)) {
+    return 0;
+  }
+  return 1;
+}
+
+static ECDSA_SIG *parse_ecdsa_sig(const uint8_t *in, size_t in_len) {
+  ECDSA_SIG *ret = ECDSA_SIG_new();
+  if (!ret ||  //
+      (in_len & 1) ||
+      BN_bin2bn(in, in_len / 2, ret->r) == NULL ||
+      BN_bin2bn(in + in_len / 2, in_len / 2, ret->s) == NULL) {
+    ECDSA_SIG_free(ret);
+    ret = NULL;
+  }
+  return ret;
+}
+
 static RSA *self_test_rsa_key(void) {
   static const uint8_t kN[] = {
       0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc,
@@ -289,195 +297,185 @@
   return NULL;
 }
 
-#if defined(OPENSSL_ANDROID)
-#define MODULE_DIGEST_SIZE SHA256_DIGEST_LENGTH
-#else
-#define MODULE_DIGEST_SIZE SHA512_DIGEST_LENGTH
-#endif
 
-int boringssl_fips_self_test(
-    const uint8_t *module_hash, size_t module_hash_len) {
-#if defined(BORINGSSL_FIPS_SELF_TEST_FLAG_FILE)
-  char flag_path[sizeof(kFlagPrefix) + 2 * MODULE_DIGEST_SIZE];
-  if (module_hash_len != 0) {
-    if (module_hash_len != MODULE_DIGEST_SIZE) {
-      fprintf(stderr,
-              "module hash of length %zu does not match expected length %d\n",
-              module_hash_len, MODULE_DIGEST_SIZE);
-      BORINGSSL_FIPS_abort();
-    }
+// Lazy self-tests
+//
+// Self-tests that are slow are deferred, in FIPS mode, until the
+// corresponding algorithm is actually exercised. (In non-FIPS mode these
+// tests are run only when requested by |BORINGSSL_self_test|.)
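+//
+// Each such test runs at most once per process: a |CRYPTO_once| guard (see
+// |boringssl_ensure_rsa_self_test| and friends below) dispatches to the test
+// and, in FIPS builds, aborts if it fails.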
 
-    // Test whether the flag file exists.
-    memcpy(flag_path, kFlagPrefix, sizeof(kFlagPrefix) - 1);
-    static const char kHexTable[17] = "0123456789abcdef";
-    for (size_t i = 0; i < MODULE_DIGEST_SIZE; i++) {
-      flag_path[sizeof(kFlagPrefix) - 1 + 2 * i] =
-          kHexTable[module_hash[i] >> 4];
-      flag_path[sizeof(kFlagPrefix) - 1 + 2 * i + 1] =
-          kHexTable[module_hash[i] & 15];
-    }
-    flag_path[sizeof(flag_path) - 1] = 0;
+static int boringssl_self_test_rsa(void) {
+  int ret = 0;
+  uint8_t output[256];
 
-    if (access(flag_path, F_OK) == 0) {
-      // Flag file found. Skip self-tests.
-      return 1;
-    }
+  RSA *const rsa_key = self_test_rsa_key();
+  if (rsa_key == NULL) {
+    fprintf(stderr, "RSA key construction failed\n");
+    goto err;
   }
-#endif // BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
 
-  static const uint8_t kAESKey[16] = "BoringCrypto Key";
-  static const uint8_t kAESIV[16] = {0};
-  static const uint8_t kPlaintext[64] =
-      "BoringCryptoModule FIPS KAT Encryption and Decryption Plaintext!";
-  static const uint8_t kAESCBCCiphertext[64] = {
-      0x87, 0x2d, 0x98, 0xc2, 0xcc, 0x31, 0x5b, 0x41, 0xe0, 0xfa, 0x7b,
-      0x0a, 0x71, 0xc0, 0x42, 0xbf, 0x4f, 0x61, 0xd0, 0x0d, 0x58, 0x8c,
-      0xf7, 0x05, 0xfb, 0x94, 0x89, 0xd3, 0xbc, 0xaa, 0x1a, 0x50, 0x45,
-      0x1f, 0xc3, 0x8c, 0xb8, 0x98, 0x86, 0xa3, 0xe3, 0x6c, 0xfc, 0xad,
-      0x3a, 0xb5, 0x59, 0x27, 0x7d, 0x21, 0x07, 0xca, 0x4c, 0x1d, 0x55,
-      0x34, 0xdd, 0x5a, 0x2d, 0xc4, 0xb4, 0xf5, 0xa8,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_CBC)
-      0x35
-#else
-      0x00
-#endif
+  // RSA Sign KAT
+
+  static const uint8_t kRSASignDigest[32] = {
+      0xd2, 0xb5, 0x6e, 0x53, 0x30, 0x6f, 0x72, 0x0d, 0x79, 0x29, 0xd8,
+      0x70, 0x8b, 0xf4, 0x6f, 0x1c, 0x22, 0x30, 0x03, 0x05, 0x58, 0x2b,
+      0x11, 0x5b, 0xed, 0xca, 0xc7, 0x22, 0xd8, 0xaa, 0x5a, 0xb2,
   };
-  static const uint8_t kAESGCMCiphertext[80] = {
-      0x4a, 0xd8, 0xe7, 0x7d, 0x78, 0xd7, 0x7d, 0x5e, 0xb2, 0x11, 0xb6, 0xc9,
-      0xa4, 0xbc, 0xb2, 0xae, 0xbe, 0x93, 0xd1, 0xb7, 0xfe, 0x65, 0xc1, 0x82,
-      0x2a, 0xb6, 0x71, 0x5f, 0x1a, 0x7c, 0xe0, 0x1b, 0x2b, 0xe2, 0x53, 0xfa,
-      0xa0, 0x47, 0xfa, 0xd7, 0x8f, 0xb1, 0x4a, 0xc4, 0xdc, 0x89, 0xf9, 0xb4,
-      0x14, 0x4d, 0xde, 0x95, 0xea, 0x29, 0x69, 0x76, 0x81, 0xa3, 0x5c, 0x33,
-      0xd8, 0x37, 0xd8, 0xfa, 0x47, 0x19, 0x46, 0x2f, 0xf1, 0x90, 0xb7, 0x61,
-      0x8f, 0x6f, 0xdd, 0x31, 0x3f, 0x6a, 0x64,
-#if !defined(BORINGSSL_FIPS_BREAK_AES_GCM)
-      0x0d
-#else
-      0x00
-#endif
+  static const uint8_t kRSASignSignature[256] = {
+      0x64, 0xce, 0xdd, 0x91, 0x27, 0xb0, 0x4f, 0xb9, 0x14, 0xea, 0xc0, 0xb4,
+      0xa2, 0x06, 0xc5, 0xd8, 0x40, 0x0f, 0x6c, 0x54, 0xac, 0xf7, 0x02, 0xde,
+      0x26, 0xbb, 0xfd, 0x33, 0xe5, 0x2f, 0x4d, 0xb1, 0x53, 0xc4, 0xff, 0xd0,
+      0x5f, 0xea, 0x15, 0x89, 0x83, 0x4c, 0xe3, 0x80, 0x0b, 0xe9, 0x13, 0x82,
+      0x1d, 0x71, 0x92, 0x1a, 0x03, 0x60, 0x2c, 0xaf, 0xe2, 0x16, 0xc7, 0x43,
+      0x3f, 0xde, 0x6b, 0x94, 0xfd, 0x6e, 0x08, 0x7b, 0x11, 0xf1, 0x34, 0x52,
+      0xe5, 0xc0, 0x97, 0x66, 0x4a, 0xe0, 0x91, 0x45, 0xc8, 0xb1, 0x3d, 0x6a,
+      0x54, 0xc1, 0x32, 0x0f, 0x32, 0xad, 0x25, 0x11, 0x3e, 0x49, 0xad, 0x41,
+      0xce, 0x7b, 0xca, 0x95, 0x6b, 0x54, 0x5e, 0x86, 0x1b, 0xce, 0xfa, 0x2a,
+      0x60, 0xe8, 0xfa, 0xbb, 0x23, 0xb2, 0x41, 0xbc, 0x7c, 0x98, 0xec, 0x73,
+      0x20, 0xed, 0xb3, 0xcf, 0xab, 0x07, 0x24, 0x85, 0x6a, 0x2a, 0x61, 0x76,
+      0x28, 0xf8, 0x00, 0x80, 0xeb, 0xd9, 0x3a, 0x63, 0xe2, 0x01, 0xb1, 0xee,
+      0x6d, 0xe9, 0x73, 0xe9, 0xb6, 0x75, 0x2e, 0xf9, 0x81, 0xd9, 0xa8, 0x79,
+      0xf6, 0x8f, 0xe3, 0x02, 0x7d, 0xf6, 0xea, 0xdc, 0x35, 0xe4, 0x62, 0x0d,
+      0x91, 0xba, 0x3e, 0x7d, 0x8b, 0x82, 0xbf, 0x15, 0x74, 0x6a, 0x4e, 0x29,
+      0xf8, 0x9b, 0x2c, 0x94, 0x8d, 0xa7, 0x00, 0x4d, 0x7b, 0xbf, 0x35, 0x07,
+      0xeb, 0xdd, 0x10, 0xef, 0xd5, 0x2f, 0xe6, 0x98, 0x4b, 0x7e, 0x24, 0x80,
+      0xe2, 0x01, 0xf2, 0x66, 0xb7, 0xd3, 0x93, 0xfe, 0x2a, 0xb3, 0x74, 0xed,
+      0xec, 0x4b, 0xb1, 0x5f, 0x5f, 0xee, 0x85, 0x44, 0xa7, 0x26, 0xdf, 0xc1,
+      0x2e, 0x7a, 0xf3, 0xa5, 0x8f, 0xf8, 0x64, 0xda, 0x65, 0xad, 0x91, 0xe2,
+      0x90, 0x94, 0x20, 0x16, 0xb8, 0x61, 0xa5, 0x0a, 0x7d, 0xb4, 0xbf, 0xc0,
+      0x10, 0xaf, 0x72, 0x67,
   };
-  static const DES_cblock kDESKey1 = {"BCMDESK1"};
-  static const DES_cblock kDESKey2 = {"BCMDESK2"};
-  static const DES_cblock kDESKey3 = {"BCMDESK3"};
-  static const DES_cblock kDESIV = {"BCMDESIV"};
-  static const uint8_t kDESCiphertext[64] = {
-      0xa4, 0x30, 0x7a, 0x4c, 0x1f, 0x60, 0x16, 0xd7, 0x4f, 0x41, 0xe1,
-      0xbb, 0x27, 0xc4, 0x27, 0x37, 0xd4, 0x7f, 0xb9, 0x10, 0xf8, 0xbc,
-      0xaf, 0x93, 0x91, 0xb8, 0x88, 0x24, 0xb1, 0xf6, 0xf8, 0xbd, 0x31,
-      0x96, 0x06, 0x76, 0xde, 0x32, 0xcd, 0x29, 0x29, 0xba, 0x70, 0x5f,
-      0xea, 0xc0, 0xcb, 0xde, 0xc7, 0x75, 0x90, 0xe0, 0x0f, 0x5e, 0x2c,
-      0x0d, 0x49, 0x20, 0xd5, 0x30, 0x83, 0xf8, 0x08,
-#if !defined(BORINGSSL_FIPS_BREAK_DES)
-      0x5a
-#else
-      0x00
-#endif
+
+  unsigned sig_len;
+  if (!rsa_sign_no_self_test(NID_sha256, kRSASignDigest, sizeof(kRSASignDigest),
+                             output, &sig_len, rsa_key) ||
+      !check_test(kRSASignSignature, output, sizeof(kRSASignSignature),
+                  "RSA-sign KAT")) {
+    fprintf(stderr, "RSA signing test failed.\n");
+    goto err;
+  }
+
+  // RSA Verify KAT
+
+  static const uint8_t kRSAVerifyDigest[32] = {
+      0x09, 0x65, 0x2f, 0xd8, 0xed, 0x9d, 0xc2, 0x6d, 0xbc, 0xbf, 0xf2,
+      0xa7, 0xa5, 0xed, 0xe1, 0x37, 0x13, 0x78, 0x21, 0x36, 0xcf, 0x8d,
+      0x22, 0x3d, 0xab, 0x93, 0xb4, 0x12, 0xa8, 0xb5, 0x15, 0x53,
   };
-  static const uint8_t kPlaintextSHA1[20] = {
-      0xc6, 0xf8, 0xc9, 0x63, 0x1c, 0x14, 0x23, 0x62, 0x9b, 0xbd,
-      0x55, 0x82, 0xf4, 0xd6, 0x1d, 0xf2, 0xab, 0x7d, 0xc8,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_1)
-      0x28
-#else
-      0x00
-#endif
+  static const uint8_t kRSAVerifySignature[256] = {
+      0xab, 0xe2, 0xcb, 0xc1, 0x3d, 0x6b, 0xd3, 0x9d, 0x48, 0xdb, 0x53, 0x34,
+      0xdd, 0xbf, 0x8d, 0x07, 0x0a, 0x93, 0xbd, 0xcb, 0x10, 0x4e, 0x2c, 0xc5,
+      0xd0, 0xee, 0x48, 0x6e, 0xe2, 0x95, 0xf6, 0xb3, 0x1b, 0xda, 0x12, 0x6c,
+      0x41, 0x89, 0x0b, 0x98, 0xb7, 0x3e, 0x70, 0xe6, 0xb6, 0x5d, 0x82, 0xf9,
+      0x5c, 0x66, 0x31, 0x21, 0x75, 0x5a, 0x90, 0x74, 0x4c, 0x8d, 0x1c, 0x21,
+      0x14, 0x8a, 0x19, 0x60, 0xbe, 0x0e, 0xca, 0x44, 0x6e, 0x9f, 0xf4, 0x97,
+      0xf1, 0x34, 0x5c, 0x53, 0x7e, 0xf8, 0x11, 0x9b, 0x9a, 0x43, 0x98, 0xe9,
+      0x5c, 0x5c, 0x6d, 0xe2, 0xb1, 0xc9, 0x55, 0x90, 0x5c, 0x52, 0x99, 0xd8,
+      0xce, 0x7a, 0x3b, 0x6a, 0xb7, 0x63, 0x80, 0xd9, 0xba, 0xbd, 0xd1, 0x5f,
+      0x61, 0x02, 0x37, 0xe1, 0xf3, 0xf2, 0xaa, 0x1c, 0x1f, 0x1e, 0x77, 0x0b,
+      0x62, 0xfb, 0xb5, 0x96, 0x38, 0x1b, 0x2e, 0xbd, 0xd7, 0x7e, 0xce, 0xf9,
+      0xc9, 0x0d, 0x4c, 0x92, 0xf7, 0xb6, 0xb0, 0x5f, 0xed, 0x29, 0x36, 0x28,
+      0x5f, 0xa9, 0x48, 0x26, 0xe6, 0x20, 0x55, 0x32, 0x2a, 0x33, 0xb6, 0xf0,
+      0x4c, 0x74, 0xce, 0x69, 0xe5, 0xd8, 0xd7, 0x37, 0xfb, 0x83, 0x8b, 0x79,
+      0xd2, 0xd4, 0x8e, 0x3d, 0xaf, 0x71, 0x38, 0x75, 0x31, 0x88, 0x25, 0x31,
+      0xa9, 0x5a, 0xc9, 0x64, 0xd0, 0x2e, 0xa4, 0x13, 0xbf, 0x85, 0x95, 0x29,
+      0x82, 0xbb, 0xc0, 0x89, 0x52, 0x7d, 0xaf, 0xf5, 0xb8, 0x45, 0xc9, 0xa0,
+      0xf4, 0xd1, 0x4e, 0xf1, 0x95, 0x6d, 0x9c, 0x3a, 0xca, 0xe8, 0x82, 0xd1,
+      0x2d, 0xa6, 0x6d, 0xa0, 0xf3, 0x57, 0x94, 0xf5, 0xee, 0x32, 0x23, 0x23,
+      0x33, 0x51, 0x7d, 0xb9, 0x31, 0x52, 0x32, 0xa1, 0x83, 0xb9, 0x91, 0x65,
+      0x4d, 0xbe, 0xa4, 0x16, 0x15, 0x34, 0x5c, 0x88, 0x53, 0x25, 0x92, 0x67,
+      0x44, 0xa5, 0x39, 0x15,
   };
-  static const uint8_t kPlaintextSHA256[32] = {
-      0x37, 0xbd, 0x70, 0x53, 0x72, 0xfc, 0xd4, 0x03, 0x79, 0x70, 0xfb,
-      0x06, 0x95, 0xb1, 0x2a, 0x82, 0x48, 0xe1, 0x3e, 0xf2, 0x33, 0xfb,
-      0xef, 0x29, 0x81, 0x22, 0x45, 0x40, 0x43, 0x70, 0xce,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_256)
-      0x0f
-#else
-      0x00
-#endif
+  if (!rsa_verify_no_self_test(NID_sha256, kRSAVerifyDigest,
+                               sizeof(kRSAVerifyDigest), kRSAVerifySignature,
+                               sizeof(kRSAVerifySignature), rsa_key)) {
+    fprintf(stderr, "RSA-verify KAT failed.\n");
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  RSA_free(rsa_key);
+
+  return ret;
+}
+
+static int boringssl_self_test_ecc(void) {
+  int ret = 0;
+  EC_KEY *ec_key = NULL;
+  EC_GROUP *ec_group = NULL;
+  EC_POINT *ec_point_in = NULL;
+  EC_POINT *ec_point_out = NULL;
+  BIGNUM *ec_scalar = NULL;
+  ECDSA_SIG *sig = NULL;
+
+  ec_key = self_test_ecdsa_key();
+  if (ec_key == NULL) {
+    fprintf(stderr, "ECDSA KeyGen failed\n");
+    goto err;
+  }
+
+  // ECDSA Sign/Verify KAT
+
+  static const uint8_t kECDSASignDigest[32] = {
+      0x1e, 0x35, 0x93, 0x0b, 0xe8, 0x60, 0xd0, 0x94, 0x2c, 0xa7, 0xbb,
+      0xd6, 0xf6, 0xde, 0xd8, 0x7f, 0x15, 0x7e, 0x4d, 0xe2, 0x4f, 0x81,
+      0xed, 0x4b, 0x87, 0x5c, 0x0e, 0x01, 0x8e, 0x89, 0xa8, 0x1f,
   };
-  static const uint8_t kPlaintextSHA512[64] = {
-      0x08, 0x6a, 0x1c, 0x84, 0x61, 0x9d, 0x8e, 0xb3, 0xc0, 0x97, 0x4e,
-      0xa1, 0x9f, 0x9c, 0xdc, 0xaf, 0x3b, 0x5c, 0x31, 0xf0, 0xf2, 0x74,
-      0xc3, 0xbd, 0x6e, 0xd6, 0x1e, 0xb2, 0xbb, 0x34, 0x74, 0x72, 0x5c,
-      0x51, 0x29, 0x8b, 0x87, 0x3a, 0xa3, 0xf2, 0x25, 0x23, 0xd4, 0x1c,
-      0x82, 0x1b, 0xfe, 0xd3, 0xc6, 0xee, 0xb5, 0xd6, 0xaf, 0x07, 0x7b,
-      0x98, 0xca, 0xa7, 0x01, 0xf3, 0x94, 0xf3, 0x68,
-#if !defined(BORINGSSL_FIPS_BREAK_SHA_512)
-      0x14
-#else
-      0x00
-#endif
-  };
-  static const uint8_t kRSASignature[256] = {
-      0x62, 0x66, 0x4b, 0xe3, 0xb1, 0xd2, 0x83, 0xf1, 0xa8, 0x56, 0x2b, 0x33,
-      0x60, 0x1e, 0xdb, 0x1e, 0x06, 0xf7, 0xa7, 0x1e, 0xa8, 0xef, 0x03, 0x4d,
-      0x0c, 0xf6, 0x83, 0x75, 0x7a, 0xf0, 0x14, 0xc7, 0xe2, 0x94, 0x3a, 0xb5,
-      0x67, 0x56, 0xa5, 0x48, 0x7f, 0x3a, 0xa5, 0xbf, 0xf7, 0x1d, 0x44, 0xa6,
-      0x34, 0xed, 0x9b, 0xd6, 0x51, 0xaa, 0x2c, 0x4e, 0xce, 0x60, 0x5f, 0xe9,
-      0x0e, 0xd5, 0xcd, 0xeb, 0x23, 0x27, 0xf8, 0xfb, 0x45, 0xe5, 0x34, 0x63,
-      0x77, 0x7f, 0x2e, 0x80, 0xcf, 0x9d, 0x2e, 0xfc, 0xe2, 0x50, 0x75, 0x29,
-      0x46, 0xf4, 0xaf, 0x91, 0xed, 0x36, 0xe1, 0x5e, 0xef, 0x66, 0xa1, 0xff,
-      0x27, 0xfc, 0x87, 0x7e, 0x60, 0x84, 0x0f, 0x54, 0x51, 0x56, 0x0f, 0x68,
-      0x99, 0xc0, 0x3f, 0xeb, 0xa5, 0xa0, 0x46, 0xb0, 0x86, 0x02, 0xb0, 0xc8,
-      0xe8, 0x46, 0x13, 0x06, 0xcd, 0xb7, 0x8a, 0xd0, 0x3b, 0x46, 0xd0, 0x14,
-      0x64, 0x53, 0x9b, 0x5b, 0x5e, 0x02, 0x45, 0xba, 0x6e, 0x7e, 0x0a, 0xb9,
-      0x9e, 0x62, 0xb7, 0xd5, 0x7a, 0x87, 0xea, 0xd3, 0x24, 0xa5, 0xef, 0xb3,
-      0xdc, 0x05, 0x9c, 0x04, 0x60, 0x4b, 0xde, 0xa8, 0x90, 0x08, 0x7b, 0x6a,
-      0x5f, 0xb4, 0x3f, 0xda, 0xc5, 0x1f, 0x6e, 0xd6, 0x15, 0xde, 0x65, 0xa4,
-      0x6e, 0x62, 0x9d, 0x8f, 0xa8, 0xbe, 0x86, 0xf6, 0x09, 0x90, 0x40, 0xa5,
-      0xf4, 0x23, 0xc5, 0xf6, 0x38, 0x86, 0x0d, 0x1c, 0xed, 0x4a, 0x0a, 0xae,
-      0xa4, 0x26, 0xc2, 0x2e, 0xd3, 0x13, 0x66, 0x61, 0xea, 0x35, 0x01, 0x0e,
-      0x13, 0xda, 0x78, 0x20, 0xae, 0x59, 0x5f, 0x9b, 0xa9, 0x6c, 0xf9, 0x1b,
-      0xdf, 0x76, 0x53, 0xc8, 0xa7, 0xf5, 0x63, 0x6d, 0xf3, 0xff, 0xfd, 0xaf,
-      0x75, 0x4b, 0xac, 0x67, 0xb1, 0x3c, 0xbf, 0x5e, 0xde, 0x73, 0x02, 0x6d,
-      0xd2, 0x0c, 0xb1,
-#if !defined(BORINGSSL_FIPS_BREAK_RSA_SIG)
-      0x64
-#else
-      0x00
-#endif
-  };
-  const uint8_t kDRBGEntropy[48] =
-      "BCM Known Answer Test DBRG Initial Entropy      ";
-  const uint8_t kDRBGPersonalization[18] = "BCMPersonalization";
-  const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD ";
-  const uint8_t kDRBGOutput[64] = {
-      0x1d, 0x63, 0xdf, 0x05, 0x51, 0x49, 0x22, 0x46, 0xcd, 0x9b, 0xc5,
-      0xbb, 0xf1, 0x5d, 0x44, 0xae, 0x13, 0x78, 0xb1, 0xe4, 0x7c, 0xf1,
-      0x96, 0x33, 0x3d, 0x60, 0xb6, 0x29, 0xd4, 0xbb, 0x6b, 0x44, 0xf9,
-      0xef, 0xd9, 0xf4, 0xa2, 0xba, 0x48, 0xea, 0x39, 0x75, 0x59, 0x32,
-      0xf7, 0x31, 0x2c, 0x98, 0x14, 0x2b, 0x49, 0xdf, 0x02, 0xb6, 0x5d,
-      0x71, 0x09, 0x50, 0xdb, 0x23, 0xdb, 0xe5, 0x22,
-#if !defined(BORINGSSL_FIPS_BREAK_DRBG)
-      0x95
-#else
-      0x00
-#endif
-  };
-  const uint8_t kDRBGEntropy2[48] =
-      "BCM Known Answer Test DBRG Reseed Entropy       ";
-  const uint8_t kDRBGReseedOutput[64] = {
-      0xa4, 0x77, 0x05, 0xdb, 0x14, 0x11, 0x76, 0x71, 0x42, 0x5b, 0xd8,
-      0xd7, 0xa5, 0x4f, 0x8b, 0x39, 0xf2, 0x10, 0x4a, 0x50, 0x5b, 0xa2,
-      0xc8, 0xf0, 0xbb, 0x3e, 0xa1, 0xa5, 0x90, 0x7d, 0x54, 0xd9, 0xc6,
-      0xb0, 0x96, 0xc0, 0x2b, 0x7e, 0x9b, 0xc9, 0xa1, 0xdd, 0x78, 0x2e,
-      0xd5, 0xa8, 0x66, 0x16, 0xbd, 0x18, 0x3c, 0xf2, 0xaa, 0x7a, 0x2b,
-      0x37, 0xf9, 0xab, 0x35, 0x64, 0x15, 0x01, 0x3f, 0xc4,
-  };
-  const uint8_t kECDSASigR[32] = {
+  static const uint8_t kECDSASignSig[64] = {
       0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0,
       0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02,
-      0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5,
-#if !defined(BORINGSSL_FIPS_BREAK_ECDSA_SIG)
-      0x0c,
-#else
-      0x00,
-#endif
+      0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x68,
+      0x04, 0x73, 0x40, 0x94, 0xb2, 0xd1, 0x90, 0xac, 0x2d, 0x0c, 0xd7,
+      0xa5, 0x7f, 0x2f, 0x2e, 0xb2, 0x62, 0xb0, 0x09, 0x16, 0xe1, 0xa6,
+      0x70, 0xb5, 0xbb, 0x0d, 0xfd, 0x8e, 0x0c, 0x02, 0x3f,
   };
-  const uint8_t kECDSASigS[32] = {
-      0xa5, 0x93, 0xe0, 0x23, 0x91, 0xe7, 0x4b, 0x8d, 0x77, 0x25, 0xa6,
-      0xba, 0x4d, 0xd9, 0x86, 0x77, 0xda, 0x7d, 0x8f, 0xef, 0xc4, 0x1a,
-      0xf0, 0xcc, 0x81, 0xe5, 0xea, 0x3f, 0xc2, 0x41, 0x7f, 0xd8,
+
+  // The 'k' value for ECDSA is fixed to avoid an entropy draw.
+  uint8_t ecdsa_k[32] = {0};
+  ecdsa_k[31] = 42;
+
+  sig = ecdsa_sign_with_nonce_for_known_answer_test(
+      kECDSASignDigest, sizeof(kECDSASignDigest), ec_key, ecdsa_k,
+      sizeof(ecdsa_k));
+
+  uint8_t ecdsa_sign_output[64];
+  if (sig == NULL ||
+      !serialize_ecdsa_sig(ecdsa_sign_output, sizeof(ecdsa_sign_output), sig) ||
+      !check_test(kECDSASignSig, ecdsa_sign_output, sizeof(ecdsa_sign_output),
+                  "ECDSA-sign signature")) {
+    fprintf(stderr, "ECDSA-sign KAT failed.\n");
+    goto err;
+  }
+
+  static const uint8_t kECDSAVerifyDigest[32] = {
+      0x78, 0x7c, 0x50, 0x5c, 0x60, 0xc9, 0xe4, 0x13, 0x6c, 0xe4, 0x48,
+      0xba, 0x93, 0xff, 0x71, 0xfa, 0x9c, 0x18, 0xf4, 0x17, 0x09, 0x4f,
+      0xdf, 0x5a, 0xe2, 0x75, 0xc0, 0xcc, 0xd2, 0x67, 0x97, 0xad,
   };
+  static const uint8_t kECDSAVerifySig[64] = {
+      0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0,
+      0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02,
+      0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x2d,
+      0x36, 0xe5, 0x79, 0x97, 0x90, 0xbf, 0xbe, 0x21, 0x83, 0xd3, 0x3e,
+      0x96, 0xf3, 0xc5, 0x1f, 0x6a, 0x23, 0x2f, 0x2a, 0x24, 0x48, 0x8c,
+      0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29,
+  };
+
+  ECDSA_SIG_free(sig);
+  sig = parse_ecdsa_sig(kECDSAVerifySig, sizeof(kECDSAVerifySig));
+  if (!sig ||
+      !ecdsa_do_verify_no_self_test(kECDSAVerifyDigest,
+                                    sizeof(kECDSAVerifyDigest), sig, ec_key)) {
+    fprintf(stderr, "ECDSA-verify KAT failed.\n");
+    goto err;
+  }
+
+  // Primitive Z Computation KAT (IG 9.6).
+
   // kP256Point is SHA256("Primitive Z Computation KAT")×G within P-256.
-  const uint8_t kP256Point[65] = {
+  static const uint8_t kP256Point[65] = {
       0x04, 0x4e, 0xc1, 0x94, 0x8c, 0x5c, 0xf4, 0x37, 0x35, 0x0d, 0xa3,
       0xf9, 0x55, 0xf9, 0x8b, 0x26, 0x23, 0x5c, 0x43, 0xe0, 0x83, 0x51,
       0x2b, 0x0d, 0x4b, 0x56, 0x24, 0xc3, 0xe4, 0xa5, 0xa8, 0xe2, 0xe9,
@@ -486,49 +484,63 @@
       0x79, 0x93, 0x7c, 0x0b, 0x92, 0x2b, 0x7f, 0x17, 0xa5, 0x80,
   };
   // kP256Scalar is SHA256("Primitive Z Computation KAT scalar").
-  const uint8_t kP256Scalar[32] = {
+  static const uint8_t kP256Scalar[32] = {
       0xe7, 0x60, 0x44, 0x91, 0x26, 0x9a, 0xfb, 0x5b, 0x10, 0x2d, 0x6e,
       0xa5, 0x2c, 0xb5, 0x9f, 0xeb, 0x70, 0xae, 0xde, 0x6c, 0xe3, 0xbf,
       0xb3, 0xe0, 0x10, 0x54, 0x85, 0xab, 0xd8, 0x61, 0xd7, 0x7b,
   };
   // kP256PointResult is |kP256Scalar|×|kP256Point|.
-  const uint8_t kP256PointResult[65] = {
+  static const uint8_t kP256PointResult[65] = {
       0x04, 0xf1, 0x63, 0x00, 0x88, 0xc5, 0xd5, 0xe9, 0x05, 0x52, 0xac,
       0xb6, 0xec, 0x68, 0x76, 0xb8, 0x73, 0x7f, 0x0f, 0x72, 0x34, 0xe6,
       0xbb, 0x30, 0x32, 0x22, 0x37, 0xb6, 0x2a, 0x80, 0xe8, 0x9e, 0x6e,
       0x6f, 0x36, 0x02, 0xe7, 0x21, 0xd2, 0x31, 0xdb, 0x94, 0x63, 0xb7,
       0xd8, 0x19, 0x0e, 0xc2, 0xc0, 0xa7, 0x2f, 0x15, 0x49, 0x1a, 0xa2,
-      0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a,
-#if !defined(BORINGSSL_FIPS_BREAK_Z_COMPUTATION)
-      0x0c,
-#else
-      0x00,
-#endif
+      0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a, 0x0c,
   };
-  const uint8_t kTLSOutput[32] = {
-      0x67, 0x85, 0xde, 0x60, 0xfc, 0x0a, 0x83, 0xe9, 0xa2, 0x2a, 0xb3,
-      0xf0, 0x27, 0x0c, 0xba, 0xf7, 0xfa, 0x82, 0x3d, 0x14, 0x77, 0x1d,
-      0x86, 0x29, 0x79, 0x39, 0x77, 0x8a, 0xd5, 0x0e, 0x9d,
-#if !defined(BORINGSSL_FIPS_BREAK_TLS_KDF)
-      0x32,
-#else
-      0x00,
-#endif
-  };
-  const uint8_t kTLSSecret[32] = {
-      0xbf, 0xe4, 0xb7, 0xe0, 0x26, 0x55, 0x5f, 0x6a, 0xdf, 0x5d, 0x27,
-      0xd6, 0x89, 0x99, 0x2a, 0xd6, 0xf7, 0x65, 0x66, 0x07, 0x4b, 0x55,
-      0x5f, 0x64, 0x55, 0xcd, 0xd5, 0x77, 0xa4, 0xc7, 0x09, 0x61,
-  };
-  const char kTLSLabel[] = "FIPS self test";
-  const uint8_t kTLSSeed1[16] = {
-      0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2,
-      0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65,
-  };
-  const uint8_t kTLSSeed2[16] = {
-      0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c,
-      0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81,
-  };
+
+  ec_group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
+  if (ec_group == NULL) {
+    fprintf(stderr, "Failed to create P-256 group.\n");
+    goto err;
+  }
+  ec_point_in = EC_POINT_new(ec_group);
+  ec_point_out = EC_POINT_new(ec_group);
+  ec_scalar = BN_new();
+  uint8_t z_comp_result[65];
+  if (ec_point_in == NULL || ec_point_out == NULL || ec_scalar == NULL ||
+      !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point),
+                          NULL) ||
+      !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) ||
+      !ec_point_mul_no_self_test(ec_group, ec_point_out, NULL, ec_point_in,
+                                 ec_scalar, NULL) ||
+      !EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED,
+                          z_comp_result, sizeof(z_comp_result), NULL) ||
+      !check_test(kP256PointResult, z_comp_result, sizeof(z_comp_result),
+                  "Z Computation Result")) {
+    fprintf(stderr, "Z-computation KAT failed.\n");
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  EC_KEY_free(ec_key);
+  EC_POINT_free(ec_point_in);
+  EC_POINT_free(ec_point_out);
+  EC_GROUP_free(ec_group);
+  BN_free(ec_scalar);
+  ECDSA_SIG_free(sig);
+
+  return ret;
+}
+
+static int boringssl_self_test_ffdh(void) {
+  int ret = 0;
+  DH *dh = NULL;
+  BIGNUM *ffdhe2048_value = NULL;
+
+  // FFC Diffie-Hellman KAT
 
   // kFFDHE2048PublicValueData is an arbitrary public value, mod
   // kFFDHE2048Data. (The private key happens to be 4096.)
@@ -550,8 +562,7 @@
       TOBN(0xbae7b0b3, 0x6e362dc0), TOBN(0xa57c73bd, 0xdc70fb82),
       TOBN(0xfaff50d2, 0x9d573457), TOBN(0x352bd399, 0xbe84058e),
   };
-
-  const uint8_t kDHOutput[2048 / 8] = {
+  static const uint8_t kDHOutput[2048 / 8] = {
       0x2a, 0xe6, 0xd3, 0xa6, 0x13, 0x58, 0x8e, 0xce, 0x53, 0xaa, 0xf6, 0x5d,
       0x9a, 0xae, 0x02, 0x12, 0xf5, 0x80, 0x3d, 0x06, 0x09, 0x76, 0xac, 0x57,
       0x37, 0x9e, 0xab, 0x38, 0x62, 0x25, 0x05, 0x1d, 0xf3, 0xa9, 0x39, 0x60,
@@ -573,23 +584,144 @@
       0x06, 0x80, 0x2a, 0x4e, 0x5a, 0xf0, 0x1e, 0xaa, 0xcb, 0xab, 0x06, 0x0e,
       0x27, 0x0f, 0xd9, 0x88, 0xd9, 0x01, 0xe3, 0x07, 0xeb, 0xdf, 0xc3, 0x12,
       0xe3, 0x40, 0x88, 0x7b, 0x5f, 0x59, 0x78, 0x6e, 0x26, 0x20, 0xc3, 0xdf,
-      0xc8, 0xe4, 0x5e,
-#if !defined(BORINGSSL_FIPS_BREAK_FFC_DH)
-      0xb8,
-#else
-      0x00,
-#endif
+      0xc8, 0xe4, 0x5e, 0xb8,
   };
 
+  ffdhe2048_value = BN_new();
+  if (ffdhe2048_value) {
+    bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData,
+                        OPENSSL_ARRAY_SIZE(kFFDHE2048PublicValueData));
+  }
+
+  dh = self_test_dh();
+  uint8_t dh_out[sizeof(kDHOutput)];
+  if (dh == NULL || ffdhe2048_value == NULL || sizeof(dh_out) != DH_size(dh) ||
+      dh_compute_key_padded_no_self_test(dh_out, ffdhe2048_value, dh) !=
+          sizeof(dh_out) ||
+      !check_test(kDHOutput, dh_out, sizeof(dh_out), "FFC DH")) {
+    fprintf(stderr, "FFDH failed.\n");
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  DH_free(dh);
+  BN_free(ffdhe2048_value);
+
+  return ret;
+}
+
+#if defined(BORINGSSL_FIPS)
+
+static void run_self_test_rsa(void) {
+  if (!boringssl_self_test_rsa()) {
+    BORINGSSL_FIPS_abort();
+  }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_rsa);
+
+void boringssl_ensure_rsa_self_test(void) {
+  CRYPTO_once(g_self_test_once_rsa_bss_get(), run_self_test_rsa);
+}
+
+static void run_self_test_ecc(void) {
+  if (!boringssl_self_test_ecc()) {
+    BORINGSSL_FIPS_abort();
+  }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_ecc);
+
+void boringssl_ensure_ecc_self_test(void) {
+  CRYPTO_once(g_self_test_once_ecc_bss_get(), run_self_test_ecc);
+}
+
+static void run_self_test_ffdh(void) {
+  if (!boringssl_self_test_ffdh()) {
+    BORINGSSL_FIPS_abort();
+  }
+}
+
+DEFINE_STATIC_ONCE(g_self_test_once_ffdh);
+
+void boringssl_ensure_ffdh_self_test(void) {
+  CRYPTO_once(g_self_test_once_ffdh_bss_get(), run_self_test_ffdh);
+}
+
+#endif  // BORINGSSL_FIPS
+
+
+// Startup self-tests.
+//
+// These tests are run at process start when in FIPS mode.
+
+int boringssl_self_test_sha256(void) {
+  static const uint8_t kInput[16] = {
+      0xff, 0x3b, 0x85, 0x7d, 0xa7, 0x23, 0x6a, 0x2b,
+      0xaa, 0x0f, 0x39, 0x6b, 0x51, 0x52, 0x22, 0x17,
+  };
+  static const uint8_t kPlaintextSHA256[32] = {
+      0x7f, 0xe4, 0xd5, 0xf1, 0xa1, 0xe3, 0x82, 0x87, 0xd9, 0x58, 0xf5,
+      0x11, 0xc7, 0x1d, 0x5e, 0x27, 0x5e, 0xcc, 0xd2, 0x66, 0xcf, 0xb9,
+      0xc8, 0xc6, 0x60, 0xd8, 0x92, 0x1e, 0x57, 0xfd, 0x46, 0x75,
+  };
+  uint8_t output[SHA256_DIGEST_LENGTH];
+
+  // SHA-256 KAT
+  SHA256(kInput, sizeof(kInput), output);
+  return check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256),
+                    "SHA-256 KAT");
+}
+
+int boringssl_self_test_sha512(void) {
+  static const uint8_t kInput[16] = {
+      0x21, 0x25, 0x12, 0xf8, 0xd2, 0xad, 0x83, 0x22,
+      0x78, 0x1c, 0x6c, 0x4d, 0x69, 0xa9, 0xda, 0xa1,
+  };
+  static const uint8_t kPlaintextSHA512[64] = {
+      0x29, 0x3c, 0x94, 0x35, 0x4e, 0x98, 0x83, 0xe5, 0xc2, 0x78, 0x36,
+      0x7a, 0xe5, 0x18, 0x90, 0xbf, 0x35, 0x41, 0x01, 0x64, 0x19, 0x8d,
+      0x26, 0xeb, 0xe1, 0xf8, 0x2f, 0x04, 0x8e, 0xfa, 0x8b, 0x2b, 0xc6,
+      0xb2, 0x9d, 0x5d, 0x46, 0x76, 0x5a, 0xc8, 0xb5, 0x25, 0xa3, 0xea,
+      0x52, 0x84, 0x47, 0x6d, 0x6d, 0xf4, 0xc9, 0x71, 0xf3, 0x3d, 0x89,
+      0x4c, 0x3b, 0x20, 0x8c, 0x5b, 0x75, 0xe8, 0xf8, 0x7c,
+  };
+  uint8_t output[SHA512_DIGEST_LENGTH];
+
+  // SHA-512 KAT
+  SHA512(kInput, sizeof(kInput), output);
+  return check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512),
+                    "SHA-512 KAT");
+}
+
+int boringssl_self_test_hmac_sha256(void) {
+  static const uint8_t kInput[16] = {
+      0xda, 0xd9, 0x12, 0x93, 0xdf, 0xcf, 0x2a, 0x7c,
+      0x8e, 0xcd, 0x13, 0xfe, 0x35, 0x3f, 0xa7, 0x5b,
+  };
+  static const uint8_t kPlaintextHMACSHA256[32] = {
+      0x36, 0x5f, 0x5b, 0xd5, 0xf5, 0xeb, 0xfd, 0xc7, 0x6e, 0x53, 0xa5,
+      0x73, 0x6d, 0x73, 0x20, 0x13, 0xaa, 0xd3, 0xbc, 0x86, 0x4b, 0xb8,
+      0x84, 0x94, 0x16, 0x46, 0x88, 0x9c, 0x48, 0xee, 0xa9, 0x0e,
+  };
+  uint8_t output[EVP_MAX_MD_SIZE];
+
+  unsigned output_len;
+  HMAC(EVP_sha256(), kInput, sizeof(kInput), kInput, sizeof(kInput), output,
+       &output_len);
+  return output_len == sizeof(kPlaintextHMACSHA256) &&
+         check_test(kPlaintextHMACSHA256, output, sizeof(kPlaintextHMACSHA256),
+                    "HMAC-SHA-256 KAT");
+}
+
+static int boringssl_self_test_fast(void) {
+  static const uint8_t kAESKey[16] = "BoringCrypto Key";
+  static const uint8_t kAESIV[16] = {0};
+
   EVP_AEAD_CTX aead_ctx;
   EVP_AEAD_CTX_zero(&aead_ctx);
-  RSA *rsa_key = NULL;
-  EC_KEY *ec_key = NULL;
-  EC_GROUP *ec_group = NULL;
-  EC_POINT *ec_point_in = NULL;
-  EC_POINT *ec_point_out = NULL;
-  BIGNUM *ec_scalar = NULL;
-  ECDSA_SIG *sig = NULL;
   int ret = 0;
 
   AES_KEY aes_key;
@@ -597,28 +729,48 @@
   uint8_t output[256];
 
   // AES-CBC Encryption KAT
+  static const uint8_t kAESCBCEncPlaintext[32] = {
+      0x07, 0x86, 0x09, 0xa6, 0xc5, 0xac, 0x25, 0x44, 0x69, 0x9a, 0xdf,
+      0x68, 0x2f, 0xa3, 0x77, 0xf9, 0xbe, 0x8a, 0xb6, 0xae, 0xf5, 0x63,
+      0xe8, 0xc5, 0x6a, 0x36, 0xb8, 0x4f, 0x55, 0x7f, 0xad, 0xd3,
+  };
+  static const uint8_t kAESCBCEncCiphertext[sizeof(kAESCBCEncPlaintext)] = {
+      0x56, 0x46, 0xc1, 0x41, 0xf4, 0x13, 0xd6, 0xff, 0x62, 0x92, 0x41,
+      0x7a, 0x26, 0xc6, 0x86, 0xbd, 0x30, 0x5f, 0xb6, 0x57, 0xa7, 0xd2,
+      0x50, 0x3a, 0xc5, 0x5e, 0x8e, 0x93, 0x40, 0xf2, 0x10, 0xd8,
+  };
   memcpy(aes_iv, kAESIV, sizeof(kAESIV));
   if (AES_set_encrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
     fprintf(stderr, "AES_set_encrypt_key failed.\n");
     goto err;
   }
-  AES_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &aes_key, aes_iv,
-                  AES_ENCRYPT);
-  if (!check_test(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
-                  "AES-CBC Encryption KAT")) {
+  AES_cbc_encrypt(kAESCBCEncPlaintext, output, sizeof(kAESCBCEncPlaintext),
+                  &aes_key, aes_iv, AES_ENCRYPT);
+  if (!check_test(kAESCBCEncCiphertext, output, sizeof(kAESCBCEncCiphertext),
+                  "AES-CBC-encrypt KAT")) {
     goto err;
   }
 
   // AES-CBC Decryption KAT
+  static const uint8_t kAESCBCDecCiphertext[32] = {
+      0x34, 0x7a, 0xa5, 0xa0, 0x24, 0xb2, 0x82, 0x57, 0xb3, 0x65, 0x10,
+      0xbe, 0x58, 0x3d, 0x4f, 0x47, 0xad, 0xb7, 0xbb, 0xee, 0xdc, 0x60,
+      0x05, 0xbb, 0xbd, 0x0d, 0x0a, 0x9f, 0x06, 0xbb, 0x7b, 0x10,
+  };
+  static const uint8_t kAESCBCDecPlaintext[sizeof(kAESCBCDecCiphertext)] = {
+      0x51, 0xa7, 0xa0, 0x1f, 0x6b, 0x79, 0x6c, 0xcd, 0x48, 0x03, 0xa1,
+      0x41, 0xdc, 0x56, 0xa6, 0xc2, 0x16, 0xb5, 0xd1, 0xd3, 0xb7, 0x06,
+      0xb2, 0x25, 0x6f, 0xa6, 0xd0, 0xd2, 0x0e, 0x6f, 0x19, 0xb5,
+  };
   memcpy(aes_iv, kAESIV, sizeof(kAESIV));
   if (AES_set_decrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) {
     fprintf(stderr, "AES_set_decrypt_key failed.\n");
     goto err;
   }
-  AES_cbc_encrypt(kAESCBCCiphertext, output, sizeof(kAESCBCCiphertext),
+  AES_cbc_encrypt(kAESCBCDecCiphertext, output, sizeof(kAESCBCDecCiphertext),
                   &aes_key, aes_iv, AES_DECRYPT);
-  if (!check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "AES-CBC Decryption KAT")) {
+  if (!check_test(kAESCBCDecPlaintext, output, sizeof(kAESCBCDecPlaintext),
+                  "AES-CBC-decrypt KAT")) {
     goto err;
   }
 
@@ -632,194 +784,115 @@
   }
 
   // AES-GCM Encryption KAT
+  static const uint8_t kAESGCMEncPlaintext[32] = {
+      0x8f, 0xcc, 0x40, 0x99, 0x80, 0x8e, 0x75, 0xca, 0xaf, 0xf5, 0x82,
+      0x89, 0x88, 0x48, 0xa8, 0x8d, 0x80, 0x8b, 0x55, 0xab, 0x4e, 0x93,
+      0x70, 0x79, 0x7d, 0x94, 0x0b, 0xe8, 0xcc, 0x1d, 0x78, 0x84,
+  };
+  static const uint8_t kAESGCMCiphertext[sizeof(kAESGCMEncPlaintext) + 16] = {
+      0x87, 0x7b, 0xd5, 0x8d, 0x96, 0x3e, 0x4b, 0xe6, 0x64, 0x94, 0x40, 0x2f,
+      0x61, 0x9b, 0x7e, 0x56, 0x52, 0x7d, 0xa4, 0x5a, 0xf9, 0xa6, 0xe2, 0xdb,
+      0x1c, 0x63, 0x2e, 0x97, 0x93, 0x0f, 0xfb, 0xed, 0xb5, 0x9e, 0x1c, 0x20,
+      0xb2, 0xb0, 0x58, 0xda, 0x48, 0x07, 0x2d, 0xbd, 0x96, 0x0d, 0x34, 0xc6,
+  };
   if (!EVP_AEAD_CTX_seal(&aead_ctx, output, &out_len, sizeof(output), nonce,
                          EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
-                         kPlaintext, sizeof(kPlaintext), NULL, 0) ||
+                         kAESGCMEncPlaintext, sizeof(kAESGCMEncPlaintext), NULL,
+                         0) ||
       !check_test(kAESGCMCiphertext, output, sizeof(kAESGCMCiphertext),
-                  "AES-GCM Encryption KAT")) {
+                  "AES-GCM-encrypt KAT")) {
     fprintf(stderr, "EVP_AEAD_CTX_seal for AES-128-GCM failed.\n");
     goto err;
   }
 
   // AES-GCM Decryption KAT
+  static const uint8_t kAESGCMDecCiphertext[48] = {
+      0x35, 0xf3, 0x05, 0x8f, 0x87, 0x57, 0x60, 0xff, 0x09, 0xd3, 0x12, 0x0f,
+      0x70, 0xc4, 0xbc, 0x9e, 0xd7, 0xa8, 0x68, 0x72, 0xe1, 0x34, 0x52, 0x20,
+      0x21, 0x76, 0xf7, 0x37, 0x1a, 0xe0, 0x4f, 0xaa, 0xe1, 0xdd, 0x39, 0x19,
+      0x20, 0xf5, 0xd1, 0x39, 0x53, 0xd8, 0x96, 0x78, 0x59, 0x94, 0x82, 0x3c,
+  };
+  static const uint8_t kAESGCMDecPlaintext[sizeof(kAESGCMDecCiphertext) - 16] =
+      {
+          0x3d, 0x44, 0x90, 0x9b, 0x91, 0xe7, 0x5e, 0xd3, 0xc2, 0xb2, 0xd0,
+          0xa9, 0x99, 0x17, 0x6a, 0x45, 0x05, 0x5e, 0x99, 0x83, 0x56, 0x01,
+          0xc0, 0x82, 0x40, 0x81, 0xd2, 0x48, 0x45, 0xf2, 0xcc, 0xc3,
+      };
   if (!EVP_AEAD_CTX_open(&aead_ctx, output, &out_len, sizeof(output), nonce,
                          EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()),
-                         kAESGCMCiphertext, sizeof(kAESGCMCiphertext), NULL,
-                         0) ||
-      !check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "AES-GCM Decryption KAT")) {
-    fprintf(stderr, "EVP_AEAD_CTX_open for AES-128-GCM failed.\n");
-    goto err;
-  }
-
-  DES_key_schedule des1, des2, des3;
-  DES_cblock des_iv;
-  DES_set_key(&kDESKey1, &des1);
-  DES_set_key(&kDESKey2, &des2);
-  DES_set_key(&kDESKey3, &des3);
-
-  // 3DES Encryption KAT
-  memcpy(&des_iv, &kDESIV, sizeof(des_iv));
-  DES_ede3_cbc_encrypt(kPlaintext, output, sizeof(kPlaintext), &des1, &des2,
-                       &des3, &des_iv, DES_ENCRYPT);
-  if (!check_test(kDESCiphertext, output, sizeof(kDESCiphertext),
-                  "3DES Encryption KAT")) {
-    goto err;
-  }
-
-  // 3DES Decryption KAT
-  memcpy(&des_iv, &kDESIV, sizeof(des_iv));
-  DES_ede3_cbc_encrypt(kDESCiphertext, output, sizeof(kDESCiphertext), &des1,
-                       &des2, &des3, &des_iv, DES_DECRYPT);
-  if (!check_test(kPlaintext, output, sizeof(kPlaintext),
-                  "3DES Decryption KAT")) {
+                         kAESGCMDecCiphertext, sizeof(kAESGCMDecCiphertext),
+                         NULL, 0) ||
+      !check_test(kAESGCMDecPlaintext, output, sizeof(kAESGCMDecPlaintext),
+                  "AES-GCM-decrypt KAT")) {
+    fprintf(stderr,
+            "AES-GCM-decrypt KAT failed because EVP_AEAD_CTX_open failed.\n");
     goto err;
   }
 
   // SHA-1 KAT
-  SHA1(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA1, output, sizeof(kPlaintextSHA1),
+  static const uint8_t kSHA1Input[16] = {
+      0x13, 0x2f, 0xd9, 0xba, 0xd5, 0xc1, 0x82, 0x62,
+      0x63, 0xba, 0xfb, 0xb6, 0x99, 0xf7, 0x07, 0xa5,
+  };
+  static const uint8_t kSHA1Digest[20] = {
+      0x94, 0x19, 0x55, 0x93, 0x0a, 0x58, 0x29, 0x38, 0xeb, 0xf5,
+      0x09, 0x11, 0x6d, 0x1a, 0xfd, 0x0f, 0x1e, 0x11, 0xe3, 0xcb,
+  };
+  SHA1(kSHA1Input, sizeof(kSHA1Input), output);
+  if (!check_test(kSHA1Digest, output, sizeof(kSHA1Digest),
                   "SHA-1 KAT")) {
     goto err;
   }
 
-  // SHA-256 KAT
-  SHA256(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256),
-                  "SHA-256 KAT")) {
-    goto err;
-  }
-
-  // SHA-512 KAT
-  SHA512(kPlaintext, sizeof(kPlaintext), output);
-  if (!check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512),
-                  "SHA-512 KAT")) {
-    goto err;
-  }
-
-  rsa_key = self_test_rsa_key();
-  if (rsa_key == NULL) {
-    fprintf(stderr, "RSA KeyGen failed\n");
-    goto err;
-  }
-
-  // RSA Sign KAT
-  unsigned sig_len;
-
-  // Disable blinding for the power-on tests because it's not needed and
-  // triggers an entropy draw.
-  rsa_key->flags |= RSA_FLAG_NO_BLINDING;
-
-  if (!RSA_sign(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256), output,
-                &sig_len, rsa_key) ||
-      !check_test(kRSASignature, output, sizeof(kRSASignature),
-                  "RSA Sign KAT")) {
-    fprintf(stderr, "RSA signing test failed.\n");
-    goto err;
-  }
-
-  // RSA Verify KAT
-  if (!RSA_verify(NID_sha256, kPlaintextSHA256, sizeof(kPlaintextSHA256),
-                  kRSASignature, sizeof(kRSASignature), rsa_key)) {
-    fprintf(stderr, "RSA Verify KAT failed.\n");
-    goto err;
-  }
-
-  ec_key = self_test_ecdsa_key();
-  if (ec_key == NULL) {
-    fprintf(stderr, "ECDSA KeyGen failed\n");
-    goto err;
-  }
-
-  // ECDSA Sign/Verify KAT
-
-  // The 'k' value for ECDSA is fixed to avoid an entropy draw.
-  uint8_t ecdsa_k[32] = {0};
-  ecdsa_k[31] = 42;
-
-  sig = ecdsa_sign_with_nonce_for_known_answer_test(
-      kPlaintextSHA256, sizeof(kPlaintextSHA256), ec_key, ecdsa_k,
-      sizeof(ecdsa_k));
-
-  uint8_t ecdsa_r_bytes[sizeof(kECDSASigR)];
-  uint8_t ecdsa_s_bytes[sizeof(kECDSASigS)];
-  if (sig == NULL ||
-      BN_num_bytes(sig->r) != sizeof(ecdsa_r_bytes) ||
-      !BN_bn2bin(sig->r, ecdsa_r_bytes) ||
-      BN_num_bytes(sig->s) != sizeof(ecdsa_s_bytes) ||
-      !BN_bn2bin(sig->s, ecdsa_s_bytes) ||
-      !check_test(kECDSASigR, ecdsa_r_bytes, sizeof(kECDSASigR), "ECDSA R") ||
-      !check_test(kECDSASigS, ecdsa_s_bytes, sizeof(kECDSASigS), "ECDSA S")) {
-    fprintf(stderr, "ECDSA signature KAT failed.\n");
-    goto err;
-  }
-
-  if (!ECDSA_do_verify(kPlaintextSHA256, sizeof(kPlaintextSHA256), sig,
-                       ec_key)) {
-    fprintf(stderr, "ECDSA verification KAT failed.\n");
-    goto err;
-  }
-
-  // Primitive Z Computation KAT (IG 9.6).
-  ec_group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1);
-  if (ec_group == NULL) {
-    fprintf(stderr, "Failed to create P-256 group.\n");
-    goto err;
-  }
-  ec_point_in = EC_POINT_new(ec_group);
-  ec_point_out = EC_POINT_new(ec_group);
-  ec_scalar = BN_new();
-  uint8_t z_comp_result[65];
-  if (ec_point_in == NULL || ec_point_out == NULL || ec_scalar == NULL ||
-      !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point),
-                          NULL) ||
-      !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) ||
-      !EC_POINT_mul(ec_group, ec_point_out, NULL, ec_point_in, ec_scalar,
-                    NULL) ||
-      !EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED,
-                          z_comp_result, sizeof(z_comp_result), NULL) ||
-      !check_test(kP256PointResult, z_comp_result, sizeof(z_comp_result),
-                  "Z Computation Result")) {
-    fprintf(stderr, "Z Computation KAT failed.\n");
-    goto err;
-  }
-
-  // FFC Diffie-Hellman KAT
-
-  BIGNUM *const ffdhe2048_value = BN_new();
-  DH *const dh = self_test_dh();
-  int dh_ok = 0;
-  if (ffdhe2048_value && dh) {
-    bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData,
-                        OPENSSL_ARRAY_SIZE(kFFDHE2048PublicValueData));
-
-    uint8_t dh_out[sizeof(kDHOutput)];
-    dh_ok =
-        sizeof(dh_out) == DH_size(dh) &&
-        DH_compute_key_padded(dh_out, ffdhe2048_value, dh) == sizeof(dh_out) &&
-        check_test(kDHOutput, dh_out, sizeof(dh_out), "FFC DH");
-  }
-
-  BN_free(ffdhe2048_value);
-  DH_free(dh);
-  if (!dh_ok) {
-    fprintf(stderr, "FFDH failed.\n");
+  if (!boringssl_self_test_sha256() ||
+      !boringssl_self_test_sha512() ||
+      !boringssl_self_test_hmac_sha256()) {
     goto err;
   }
 
   // DRBG KAT
+  static const uint8_t kDRBGEntropy[48] = {
+      0xc4, 0xda, 0x07, 0x40, 0xd5, 0x05, 0xf1, 0xee, 0x28, 0x0b, 0x95, 0xe5,
+      0x8c, 0x49, 0x31, 0xac, 0x6d, 0xe8, 0x46, 0xa0, 0x15, 0x2f, 0xbb, 0x4a,
+      0x3f, 0x17, 0x4c, 0xf4, 0x78, 0x7a, 0x4f, 0x1a, 0x40, 0xc2, 0xb5, 0x0b,
+      0xab, 0xe1, 0x4a, 0xae, 0x53, 0x0b, 0xe5, 0x88, 0x6d, 0x91, 0x0a, 0x27,
+  };
+  static const uint8_t kDRBGPersonalization[18] = "BCMPersonalization";
+  static const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD ";
+  static const uint8_t kDRBGOutput[64] = {
+      0x19, 0x1f, 0x2b, 0x49, 0x76, 0x85, 0xfd, 0x51, 0xb6, 0x56, 0xbc,
+      0x1c, 0x7d, 0xd5, 0xdd, 0x44, 0x76, 0xa3, 0x5e, 0x17, 0x9b, 0x8e,
+      0xb8, 0x98, 0x65, 0x12, 0xca, 0x35, 0x6c, 0xa0, 0x6f, 0xa0, 0x22,
+      0xe4, 0xf6, 0xd8, 0x43, 0xed, 0x4e, 0x2d, 0x97, 0x39, 0x43, 0x3b,
+      0x57, 0xfc, 0x23, 0x3f, 0x71, 0x0a, 0xe0, 0xed, 0xfe, 0xd5, 0xb8,
+      0x67, 0x7a, 0x00, 0x39, 0xb2, 0x6e, 0xa9, 0x25, 0x97,
+  };
+  static const uint8_t kDRBGEntropy2[48] = {
+      0xc7, 0x16, 0x1c, 0xa3, 0x6c, 0x23, 0x09, 0xb7, 0x16, 0xe9, 0x85, 0x9b,
+      0xb9, 0x6c, 0x6d, 0x49, 0xbd, 0xc8, 0x35, 0x21, 0x03, 0xa1, 0x8c, 0xd2,
+      0x4e, 0xf4, 0x2e, 0xc9, 0x7e, 0xf4, 0x6b, 0xf4, 0x46, 0xeb, 0x1a, 0x45,
+      0x76, 0xc1, 0x86, 0xe9, 0x35, 0x18, 0x03, 0x76, 0x3a, 0x79, 0x12, 0xfe,
+  };
+  static const uint8_t kDRBGReseedOutput[64] = {
+      0x00, 0xf2, 0x05, 0xaa, 0xfd, 0x11, 0x6c, 0x77, 0xbc, 0x81, 0x86,
+      0x99, 0xca, 0x51, 0xcf, 0x80, 0x15, 0x9f, 0x02, 0x9e, 0x0b, 0xcd,
+      0x26, 0xc8, 0x4b, 0x87, 0x8a, 0x15, 0x1a, 0xdd, 0xf2, 0xf3, 0xeb,
+      0x94, 0x0b, 0x08, 0xc8, 0xc9, 0x57, 0xa4, 0x0b, 0x4b, 0x0f, 0x13,
+      0xde, 0x7c, 0x0c, 0x6a, 0xac, 0x34, 0x4a, 0x9a, 0xf2, 0xd0, 0x83,
+      0x02, 0x05, 0x17, 0xc9, 0x81, 0x8f, 0x2a, 0x81, 0x92,
+  };
   CTR_DRBG_STATE drbg;
   if (!CTR_DRBG_init(&drbg, kDRBGEntropy, kDRBGPersonalization,
                      sizeof(kDRBGPersonalization)) ||
       !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGOutput), kDRBGAD,
                          sizeof(kDRBGAD)) ||
       !check_test(kDRBGOutput, output, sizeof(kDRBGOutput),
-                  "DBRG Generate KAT") ||
+                  "DRBG Generate KAT") ||
       !CTR_DRBG_reseed(&drbg, kDRBGEntropy2, kDRBGAD, sizeof(kDRBGAD)) ||
       !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGReseedOutput), kDRBGAD,
                          sizeof(kDRBGAD)) ||
       !check_test(kDRBGReseedOutput, output, sizeof(kDRBGReseedOutput),
-                  "DRBG Reseed KAT")) {
+                  "DRBG-reseed KAT")) {
     fprintf(stderr, "CTR-DRBG failed.\n");
     goto err;
   }
@@ -832,43 +905,59 @@
   }
 
   // TLS KDF KAT
+  static const uint8_t kTLSSecret[32] = {
+      0xab, 0xc3, 0x65, 0x7b, 0x09, 0x4c, 0x76, 0x28, 0xa0, 0xb2, 0x82,
+      0x99, 0x6f, 0xe7, 0x5a, 0x75, 0xf4, 0x98, 0x4f, 0xd9, 0x4d, 0x4e,
+      0xcc, 0x2f, 0xcf, 0x53, 0xa2, 0xc4, 0x69, 0xa3, 0xf7, 0x31,
+  };
+  static const char kTLSLabel[] = "FIPS self test";
+  static const uint8_t kTLSSeed1[16] = {
+      0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2,
+      0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65,
+  };
+  static const uint8_t kTLSSeed2[16] = {
+      0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c,
+      0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81,
+  };
+  static const uint8_t kTLSOutput[32] = {
+      0xe2, 0x1d, 0xd6, 0xc2, 0x68, 0xc7, 0x57, 0x03, 0x2c, 0x2c, 0xeb,
+      0xbb, 0xb8, 0xa9, 0x7d, 0xe9, 0xee, 0xe6, 0xc9, 0x47, 0x83, 0x0a,
+      0xbd, 0x11, 0x60, 0x5d, 0xd5, 0x2c, 0x47, 0xb6, 0x05, 0x88,
+  };
   uint8_t tls_output[sizeof(kTLSOutput)];
   if (!CRYPTO_tls1_prf(EVP_sha256(), tls_output, sizeof(tls_output), kTLSSecret,
                        sizeof(kTLSSecret), kTLSLabel, sizeof(kTLSLabel),
                        kTLSSeed1, sizeof(kTLSSeed1), kTLSSeed2,
                        sizeof(kTLSSeed2)) ||
-      !check_test(kTLSOutput, tls_output, sizeof(kTLSOutput), "TLS KDF KAT")) {
+      !check_test(kTLSOutput, tls_output, sizeof(kTLSOutput), "TLS-KDF KAT")) {
     fprintf(stderr, "TLS KDF failed.\n");
     goto err;
   }
 
   ret = 1;
 
-#if defined(BORINGSSL_FIPS_SELF_TEST_FLAG_FILE)
-  // Tests were successful. Write flag file if requested.
-  if (module_hash_len != 0 && getenv(kFlagWriteEnableEnvVar) != NULL) {
-    const int fd = open(flag_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
-    if (fd >= 0) {
-      close(fd);
-    }
-  }
-#endif  // BORINGSSL_FIPS_SELF_TEST_FLAG_FILE
-
 err:
   EVP_AEAD_CTX_cleanup(&aead_ctx);
-  RSA_free(rsa_key);
-  EC_KEY_free(ec_key);
-  EC_POINT_free(ec_point_in);
-  EC_POINT_free(ec_point_out);
-  EC_GROUP_free(ec_group);
-  BN_free(ec_scalar);
-  ECDSA_SIG_free(sig);
 
   return ret;
 }
 
 int BORINGSSL_self_test(void) {
-  return boringssl_fips_self_test(NULL, 0);
+  if (!boringssl_self_test_fast() ||
+      // When requested to run self tests, also run the lazy tests.
+      !boringssl_self_test_rsa() ||
+      !boringssl_self_test_ecc() ||
+      !boringssl_self_test_ffdh()) {
+    return 0;
+  }
+
+  return 1;
 }
 
+#if defined(BORINGSSL_FIPS)
+int boringssl_self_test_startup(void) {
+  return boringssl_self_test_fast();
+}
+#endif
+
 #endif  // !_MSC_VER
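
The refactored entry point above runs the fast startup KATs and then the lazy RSA, ECC, and FFDH tests. A minimal caller-side sketch, assuming a non-MSVC build of BoringSSL (the self tests above are compiled out under _MSC_VER at this revision); |BORINGSSL_self_test| is declared in <openssl/crypto.h>:

#include <stdio.h>
#include <openssl/crypto.h>

int main(void) {
  // BORINGSSL_self_test runs the fast startup tests plus the lazy
  // RSA, ECC, and FFDH tests, returning one on success.
  if (!BORINGSSL_self_test()) {
    fprintf(stderr, "FIPS self tests failed.\n");
    return 1;
  }
  printf("FIPS self tests passed.\n");
  return 0;
}
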
diff --git a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 61f67cb..2abd065 100755
--- a/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/src/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -126,15 +126,12 @@
 # versions, but BoringSSL is intended to be used with pre-generated perlasm
 # output, so this isn't useful anyway.
 #
-# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
-# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
-# did not tie them together until after $shaext was added.
+# This file also has an AVX2 implementation, controlled by setting $avx to 2.
+# For now, we intentionally disable it. While it gives a 13-16% perf boost, the
+# CFI annotations are wrong. It allocates stack in a loop and should be
+# rewritten to avoid this.
 $avx = 1;
-
-# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
-# been tested.
-$shaext=0;	### set to zero if compiling for 1.0.1
-$avx=1		if (!$shaext && $avx);
+$shaext = 1;
 
 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;
@@ -275,7 +272,7 @@
 ___
 $code.=<<___ if ($SZ==4 && $shaext);
 	test	\$`1<<29`,%r11d		# check for SHA
-	jnz	_shaext_shortcut
+	jnz	.Lshaext_shortcut
 ___
     # XOP codepath removed.
 $code.=<<___ if ($avx>1);
@@ -559,7 +556,8 @@
 .type	sha256_block_data_order_shaext,\@function,3
 .align	64
 sha256_block_data_order_shaext:
-_shaext_shortcut:
+.cfi_startproc
+.Lshaext_shortcut:
 ___
 $code.=<<___ if ($win64);
 	lea	`-8-5*16`(%rsp),%rsp
@@ -703,6 +701,7 @@
 ___
 $code.=<<___;
 	ret
+.cfi_endproc
 .size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
 ___
 }}}
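
The $shaext knob above gates a fast path that the generated assembly selects at runtime by testing bit 29 of the cached CPUID leaf-7 EBX word. A C-level sketch of the same check, assuming access to the internal |OPENSSL_ia32cap_get| accessor; there is no CRYPTO_is_*_capable wrapper for SHA Extensions at this revision:

#include "crypto/internal.h"  // internal header; include path illustrative

// SHA Extensions: CPUID.(EAX=7,ECX=0):EBX bit 29, cached by BoringSSL in
// word 2 of the ia32cap vector, matching the assembly's %r11d test.
static int has_sha_ext(void) {
  return (OPENSSL_ia32cap_get()[2] & (1u << 29)) != 0;
}
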
diff --git a/src/crypto/hpke/hpke.c b/src/crypto/hpke/hpke.c
index c71ac2a..827ffaa 100644
--- a/src/crypto/hpke/hpke.c
+++ b/src/crypto/hpke/hpke.c
@@ -30,7 +30,7 @@
 #include "../internal.h"
 
 
-// This file implements draft-irtf-cfrg-hpke-12.
+// This file implements RFC 9180.
 
 #define MAX_SEED_LEN X25519_PRIVATE_KEY_LEN
 #define MAX_SHARED_SECRET_LEN SHA256_DIGEST_LENGTH
@@ -115,7 +115,7 @@
 // KEM implementations.
 
 // dhkem_extract_and_expand implements the ExtractAndExpand operation in the
-// DHKEM construction. See section 4.1 of draft-irtf-cfrg-hpke-12.
+// DHKEM construction. See section 4.1 of RFC 9180.
 static int dhkem_extract_and_expand(uint16_t kem_id, const EVP_MD *hkdf_md,
                                     uint8_t *out_key, size_t out_len,
                                     const uint8_t *dh, size_t dh_len,
diff --git a/src/crypto/hpke/translate_test_vectors.py b/src/crypto/hpke/translate_test_vectors.py
index a4e399b..a1fffcf 100755
--- a/src/crypto/hpke/translate_test_vectors.py
+++ b/src/crypto/hpke/translate_test_vectors.py
@@ -19,7 +19,7 @@
 Usage: translate_test_vectors.py TEST_VECTORS_JSON_FILE
 
 The TEST_VECTORS_JSON_FILE is expected to come from the JSON copy of
-draft-irtf-cfrg-hpke-12's test vectors, linked from its [TestVectors] citation.
+RFC 9180's test vectors, linked from its [TestVectors] citation.
 The output is written to "hpke_test_vectors.txt".
 """
 
diff --git a/src/crypto/hrss/hrss.c b/src/crypto/hrss/hrss.c
index 8e21068..388c9a9 100644
--- a/src/crypto/hrss/hrss.c
+++ b/src/crypto/hrss/hrss.c
@@ -1314,8 +1314,7 @@
 static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r,
                      const struct poly *a, const struct poly *b) {
 #if defined(POLY_RQ_MUL_ASM)
-  const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
-  if (has_avx2) {
+  if (CRYPTO_is_AVX2_capable()) {
     poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq);
     return;
   }
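
This change replaces a hand-rolled |OPENSSL_ia32cap_P| bit test with the named helper added to crypto/internal.h later in this patch. A self-contained sketch of the same dispatch shape; the kernel names are hypothetical:

#include <stddef.h>
#include <stdint.h>
#include "crypto/internal.h"  // internal header; include path illustrative

// Hypothetical kernels: an AVX2-accelerated path and a portable fallback.
void sum_avx2(uint64_t *out, const uint64_t *in, size_t n);
void sum_generic(uint64_t *out, const uint64_t *in, size_t n);

void sum_dispatch(uint64_t *out, const uint64_t *in, size_t n) {
  // When the whole build targets AVX2 (-mavx2), CRYPTO_is_AVX2_capable()
  // folds to a constant 1 and the fallback branch is compiled away.
  if (CRYPTO_is_AVX2_capable()) {
    sum_avx2(out, in, n);
    return;
  }
  sum_generic(out, in, n);
}
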
diff --git a/src/crypto/hrss/hrss_test.cc b/src/crypto/hrss/hrss_test.cc
index 0693c82..bab968c 100644
--- a/src/crypto/hrss/hrss_test.cc
+++ b/src/crypto/hrss/hrss_test.cc
@@ -453,8 +453,7 @@
 
 #if defined(POLY_RQ_MUL_ASM) && defined(SUPPORTS_ABI_TEST)
 TEST(HRSS, ABI) {
-  const bool has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
-  if (!has_avx2) {
+  if (!CRYPTO_is_AVX2_capable()) {
     fprintf(stderr, "Skipping ABI test due to lack of AVX2 support.\n");
     return;
   }
diff --git a/src/crypto/impl_dispatch_test.cc b/src/crypto/impl_dispatch_test.cc
index dae9e96..631e78f 100644
--- a/src/crypto/impl_dispatch_test.cc
+++ b/src/crypto/impl_dispatch_test.cc
@@ -33,9 +33,9 @@
  public:
   void SetUp() override {
 #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-    aesni_ = OPENSSL_ia32cap_P[1] & (1 << (57 - 32));
-    avx_movbe_ = ((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41;
-    ssse3_ = OPENSSL_ia32cap_P[1] & (1 << (41 - 32));
+    aesni_ = CRYPTO_is_AESNI_capable();
+    avx_movbe_ = CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable();
+    ssse3_ = CRYPTO_is_SSSE3_capable();
     is_x86_64_ =
 #if defined(OPENSSL_X86_64)
         true;
diff --git a/src/crypto/internal.h b/src/crypto/internal.h
index 42f94d5..78dbbbf 100644
--- a/src/crypto/internal.h
+++ b/src/crypto/internal.h
@@ -121,6 +121,10 @@
 #include <valgrind/memcheck.h>
 #endif
 
+#if defined(BORINGSSL_FIPS_BREAK_TESTS)
+#include <stdlib.h>
+#endif
+
 #if !defined(__cplusplus)
 #if defined(_MSC_VER)
 #define alignas(x) __declspec(align(x))
@@ -932,19 +936,50 @@
 // FIPS functions.
 
 #if defined(BORINGSSL_FIPS)
+
 // BORINGSSL_FIPS_abort is called when a FIPS power-on or continuous test
 // fails. It prevents any further cryptographic operations by the current
 // process.
 void BORINGSSL_FIPS_abort(void) __attribute__((noreturn));
-#endif
 
-// boringssl_fips_self_test runs the FIPS KAT-based self tests. It returns one
-// on success and zero on error. The argument is the integrity hash of the FIPS
-// module and may be used to check and write flag files to suppress duplicate
-// self-tests. If |module_hash_len| is zero then no flag file will be checked
-// nor written and tests will always be run.
-int boringssl_fips_self_test(const uint8_t *module_hash,
-                             size_t module_hash_len);
+// boringssl_self_test_startup runs all startup self tests and returns one on
+// success or zero on error. Startup self tests do not include lazy tests.
+// Call |BORINGSSL_self_test| to run every self test.
+int boringssl_self_test_startup(void);
+
+// boringssl_ensure_rsa_self_test checks whether the RSA self-test has been run
+// in this address space. If not, it runs it and crashes the address space if
+// unsuccessful.
+void boringssl_ensure_rsa_self_test(void);
+
+// boringssl_ensure_ecc_self_test checks whether the ECDSA and ECDH self-test
+// has been run in this address space. If not, it runs it and crashes the
+// address space if unsuccessful.
+void boringssl_ensure_ecc_self_test(void);
+
+// boringssl_ensure_ffdh_self_test checks whether the FFDH self-test has been
+// run in this address space. If not, it runs it and crashes the address space
+// if unsuccessful.
+void boringssl_ensure_ffdh_self_test(void);
+
+#else
+
+// Outside of FIPS mode, the lazy tests are no-ops.
+
+OPENSSL_INLINE void boringssl_ensure_rsa_self_test(void) {}
+OPENSSL_INLINE void boringssl_ensure_ecc_self_test(void) {}
+OPENSSL_INLINE void boringssl_ensure_ffdh_self_test(void) {}
+
+#endif  // FIPS
+
+// boringssl_self_test_sha256 performs a SHA-256 KAT.
+int boringssl_self_test_sha256(void);
+
+// boringssl_self_test_sha512 performs a SHA-512 KAT.
+int boringssl_self_test_sha512(void);
+
+// boringssl_self_test_hmac_sha256 performs an HMAC-SHA-256 KAT.
+int boringssl_self_test_hmac_sha256(void);
 
 #if defined(BORINGSSL_FIPS_COUNTERS)
 void boringssl_fips_inc_counter(enum fips_counter_t counter);
@@ -952,6 +987,17 @@
 OPENSSL_INLINE void boringssl_fips_inc_counter(enum fips_counter_t counter) {}
 #endif
 
+#if defined(BORINGSSL_FIPS_BREAK_TESTS)
+OPENSSL_INLINE int boringssl_fips_break_test(const char *test) {
+  const char *const value = getenv("BORINGSSL_FIPS_BREAK_TEST");
+  return value != NULL && strcmp(value, test) == 0;
+}
+#else
+OPENSSL_INLINE int boringssl_fips_break_test(const char *test) {
+  return 0;
+}
+#endif  // BORINGSSL_FIPS_BREAK_TESTS
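
A sketch of how a KAT can consume this hook, following the same corrupt-the-computed-answer pattern the self tests use. The vector below is the standard SHA-256("abc") answer rather than one of BoringSSL's self-test vectors, and the "SHA-256" break-test name is illustrative:

#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>
#include "crypto/internal.h"  // internal header; include path illustrative

static const uint8_t kKATInput[3] = {'a', 'b', 'c'};
static const uint8_t kKATDigest[SHA256_DIGEST_LENGTH] = {
    0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea, 0x41, 0x41, 0x40,
    0xde, 0x5d, 0xae, 0x22, 0x23, 0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17,
    0x7a, 0x9c, 0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad,
};

static int run_sha256_kat(void) {
  uint8_t digest[SHA256_DIGEST_LENGTH];
  SHA256(kKATInput, sizeof(kKATInput), digest);
  if (boringssl_fips_break_test("SHA-256")) {
    digest[0] ^= 0xff;  // corrupt the answer so the KAT must fail
  }
  return memcmp(digest, kKATDigest, sizeof(kKATDigest)) == 0;
}
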
+
 
 // Runtime CPU feature support
 
@@ -978,14 +1024,126 @@
 extern uint32_t OPENSSL_ia32cap_P[4];
 
 #if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY)
-const uint32_t *OPENSSL_ia32cap_get(void);
+// The FIPS module, as a static library, requires an out-of-line version of
+// |OPENSSL_ia32cap_get| so accesses can be rewritten by delocate. Mark the
+// function const so multiple accesses can be optimized together.
+const uint32_t *OPENSSL_ia32cap_get(void) __attribute__((const));
 #else
 OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) {
   return OPENSSL_ia32cap_P;
 }
 #endif
 
+// See Intel manual, volume 2A, table 3-11.
+
+OPENSSL_INLINE int CRYPTO_is_FXSR_capable(void) {
+#if defined(__FXSR__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[0] & (1 << 24)) != 0;
 #endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_intel_cpu(void) {
+  // The reserved bit 30 is used to indicate an Intel CPU.
+  return (OPENSSL_ia32cap_get()[0] & (1 << 30)) != 0;
+}
+
+// See Intel manual, volume 2A, table 3-10.
+
+OPENSSL_INLINE int CRYPTO_is_PCLMUL_capable(void) {
+#if defined(__PCLMUL__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 1)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSSE3_capable(void) {
+#if defined(__SSSE3__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 9)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_SSE4_1_capable(void) {
+#if defined(__SSE4_1__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 19)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_MOVBE_capable(void) {
+#if defined(__MOVBE__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 22)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) {
+#if defined(__AES__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) {
+#if defined(__AVX__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_RDRAND_capable(void) {
+  // The GCC/Clang feature name and preprocessor symbol for RDRAND are "rdrnd"
+  // and |__RDRND__|, respectively.
+#if defined(__RDRND__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0;
+#endif
+}
+
+// See Intel manual, volume 2A, table 3-8.
+
+OPENSSL_INLINE int CRYPTO_is_BMI1_capable(void) {
+#if defined(__BMI1__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 3)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_AVX2_capable(void) {
+#if defined(__AVX2__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 5)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_BMI2_capable(void) {
+#if defined(__BMI2__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 8)) != 0;
+#endif
+}
+
+OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) {
+#if defined(__ADX__)
+  return 1;
+#else
+  return (OPENSSL_ia32cap_get()[2] & (1 << 19)) != 0;
+#endif
+}
+
+#endif  // OPENSSL_X86 || OPENSSL_X86_64
 
 #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)
 
diff --git a/src/crypto/pem/pem_all.c b/src/crypto/pem/pem_all.c
index e419774..706b7f4 100644
--- a/src/crypto/pem/pem_all.c
+++ b/src/crypto/pem/pem_all.c
@@ -200,7 +200,7 @@
 IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA,
                              DSAPrivateKey)
 
-    IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
+IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY)
 DSA *PEM_read_DSAPrivateKey(FILE *fp, DSA **dsa, pem_password_cb *cb, void *u)
 {
     EVP_PKEY *pktmp;
@@ -237,7 +237,7 @@
 IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY,
                        ECPrivateKey)
 
-    IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
+IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY)
 EC_KEY *PEM_read_ECPrivateKey(FILE *fp, EC_KEY **eckey, pem_password_cb *cb,
                               void *u)
 {
@@ -247,6 +247,6 @@
 }
 
 
-IMPLEMENT_PEM_write_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
+IMPLEMENT_PEM_rw_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams)
 
-    IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
+IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY)
diff --git a/src/crypto/pem/pem_pkey.c b/src/crypto/pem/pem_pkey.c
index 48d8c96..f75486d 100644
--- a/src/crypto/pem/pem_pkey.c
+++ b/src/crypto/pem/pem_pkey.c
@@ -176,39 +176,3 @@
     BIO_free(b);
     return ret;
 }
-
-
-/* Transparently read in PKCS#3 or X9.42 DH parameters */
-
-DH *PEM_read_bio_DHparams(BIO *bp, DH **x, pem_password_cb *cb, void *u)
-{
-    char *nm = NULL;
-    const unsigned char *p = NULL;
-    unsigned char *data = NULL;
-    long len;
-    DH *ret = NULL;
-
-    if (!PEM_bytes_read_bio(&data, &len, &nm, PEM_STRING_DHPARAMS, bp, cb, u))
-        return NULL;
-    p = data;
-
-    ret = d2i_DHparams(x, &p, len);
-
-    if (ret == NULL)
-        OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB);
-    OPENSSL_free(nm);
-    OPENSSL_free(data);
-    return ret;
-}
-
-DH *PEM_read_DHparams(FILE *fp, DH **x, pem_password_cb *cb, void *u)
-{
-    BIO *b = BIO_new_fp(fp, BIO_NOCLOSE);
-    if (b == NULL) {
-        OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB);
-        return NULL;
-    }
-    DH *ret = PEM_read_bio_DHparams(b, x, cb, u);
-    BIO_free(b);
-    return ret;
-}
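
With these hand-written readers deleted, |PEM_read_bio_DHparams| and |PEM_read_DHparams| are now generated by the IMPLEMENT_PEM_rw_const invocation in pem_all.c above, so callers are unaffected. A minimal usage sketch:

#include <openssl/bio.h>
#include <openssl/dh.h>
#include <openssl/pem.h>

// Reads PKCS#3 DH parameters from a PEM file, or returns NULL on error.
static DH *read_dh_params(const char *path) {
  BIO *bio = BIO_new_file(path, "rb");
  if (bio == NULL) {
    return NULL;
  }
  DH *dh = PEM_read_bio_DHparams(bio, NULL, NULL, NULL);
  BIO_free(bio);
  return dh;
}
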
diff --git a/src/crypto/pkcs8/pkcs12_test.cc b/src/crypto/pkcs8/pkcs12_test.cc
index e67630d..958bd8d 100644
--- a/src/crypto/pkcs8/pkcs12_test.cc
+++ b/src/crypto/pkcs8/pkcs12_test.cc
@@ -34,7 +34,7 @@
 static const char kPassword[] = "foo";
 
 // kUnicodePassword is the password for unicode_password.p12
-static const char kUnicodePassword[] = u8"Hello, 世界";
+static const char kUnicodePassword[] = "Hello, 世界";
 
 static bssl::Span<const uint8_t> StringToBytes(const std::string &str) {
   return bssl::MakeConstSpan(reinterpret_cast<const uint8_t *>(str.data()),
@@ -391,7 +391,7 @@
                 {bssl::Span<const uint8_t>(kTestCert2)}, 0, 0, 0, 0);
 
   // Test some Unicode.
-  TestRoundTrip(kPassword, u8"Hello, 世界!",
+  TestRoundTrip(kPassword, "Hello, 世界!",
                 bssl::Span<const uint8_t>(kTestKey),
                 bssl::Span<const uint8_t>(kTestCert),
                 {bssl::Span<const uint8_t>(kTestCert2)}, 0, 0, 0, 0);
diff --git a/src/decrepit/des/cfb64ede.c b/src/decrepit/des/cfb64ede.c
index 6c39923..820c52e 100644
--- a/src/decrepit/des/cfb64ede.c
+++ b/src/decrepit/des/cfb64ede.c
@@ -58,7 +58,7 @@
 
 #include <openssl/des.h>
 
-#include "../../crypto/fipsmodule/des/internal.h"
+#include "../../crypto/des/internal.h"
 #include "../../crypto/internal.h"
 
 
diff --git a/src/include/openssl/hpke.h b/src/include/openssl/hpke.h
index 56251b7..e2c9855 100644
--- a/src/include/openssl/hpke.h
+++ b/src/include/openssl/hpke.h
@@ -30,7 +30,7 @@
 // Hybrid Public Key Encryption (HPKE) enables a sender to encrypt messages to a
 // receiver with a public key.
 //
-// See https://tools.ietf.org/html/draft-irtf-cfrg-hpke-12.
+// See RFC 9180.
 
 
 // Parameters.
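
With the header now tracking the final RFC, a minimal sender-side sketch against the public API, using one of the suites the header exposes. It assumes the receiver's 32-byte X25519 public key was obtained out of band, and that |enc| is transmitted alongside the ciphertext:

#include <openssl/hpke.h>

static int hpke_send(const uint8_t peer_pub[32], const uint8_t *msg,
                     size_t msg_len, uint8_t *out, size_t *out_len,
                     size_t max_out) {
  EVP_HPKE_CTX ctx;
  EVP_HPKE_CTX_zero(&ctx);
  uint8_t enc[EVP_HPKE_MAX_ENC_LENGTH];
  size_t enc_len;
  // DHKEM(X25519, HKDF-SHA256) with HKDF-SHA256 and AES-128-GCM.
  int ok = EVP_HPKE_CTX_setup_sender(&ctx, enc, &enc_len, sizeof(enc),
                                     EVP_hpke_x25519_hkdf_sha256(),
                                     EVP_hpke_hkdf_sha256(),
                                     EVP_hpke_aes_128_gcm(), peer_pub, 32,
                                     /*info=*/NULL, /*info_len=*/0) &&
           EVP_HPKE_CTX_seal(&ctx, out, out_len, max_out, msg, msg_len,
                             /*ad=*/NULL, /*ad_len=*/0);
  // |enc| (enc_len bytes) must reach the receiver so it can derive the same
  // keys via EVP_HPKE_CTX_setup_recipient.
  EVP_HPKE_CTX_cleanup(&ctx);
  return ok;
}
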
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 232c627..a3b530e 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -4039,10 +4039,16 @@
 // |len| bytes from |buf| contain the handshake message, one-byte
 // ChangeCipherSpec body, and two-byte alert, respectively.
 //
+// In connections that enable ECH, |cb| is additionally called with
+// |content_type| = |SSL3_RT_CLIENT_HELLO_INNER| for each ClientHelloInner that
+// is encrypted or decrypted. The |len| bytes from |buf| contain the
+// ClientHelloInner, including the reconstructed outer extensions and handshake
+// header.
+//
 // For a V2ClientHello, |version| is |SSL2_VERSION|, |content_type| is zero, and
 // the |len| bytes from |buf| contain the V2ClientHello structure.
 OPENSSL_EXPORT void SSL_CTX_set_msg_callback(
-    SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type,
+    SSL_CTX *ctx, void (*cb)(int is_write, int version, int content_type,
                              const void *buf, size_t len, SSL *ssl, void *arg));
 
 // SSL_CTX_set_msg_callback_arg sets the |arg| parameter of the message
@@ -5598,7 +5604,7 @@
 #define SSL_R_INVALID_ECH_PUBLIC_NAME 317
 #define SSL_R_INVALID_ECH_CONFIG_LIST 318
 #define SSL_R_ECH_REJECTED 319
-#define SSL_R_OUTER_EXTENSION_NOT_FOUND 320
+#define SSL_R_INVALID_OUTER_EXTENSION 320
 #define SSL_R_INCONSISTENT_ECH_NEGOTIATION 321
 #define SSL_R_SSLV3_ALERT_CLOSE_NOTIFY 1000
 #define SSL_R_SSLV3_ALERT_UNEXPECTED_MESSAGE 1010
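
A sketch of a message callback that reports the new ClientHelloInner events alongside ordinary records; callbacks observe |buf| but must not modify it:

#include <stdio.h>
#include <openssl/ssl.h>

static void msg_cb(int is_write, int version, int content_type,
                   const void *buf, size_t len, SSL *ssl, void *arg) {
  if (content_type == SSL3_RT_CLIENT_HELLO_INNER) {
    // In ECH handshakes: called once per ClientHelloInner encrypted
    // (client side) or decrypted (server side).
    fprintf(stderr, "%s ClientHelloInner, %zu bytes\n",
            is_write ? "wrote" : "read", len);
  }
}

// Install on a context with: SSL_CTX_set_msg_callback(ctx, msg_cb);
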
diff --git a/src/include/openssl/ssl3.h b/src/include/openssl/ssl3.h
index e3910f0..533142c 100644
--- a/src/include/openssl/ssl3.h
+++ b/src/include/openssl/ssl3.h
@@ -275,6 +275,7 @@
 
 // Pseudo content type for SSL/TLS header info
 #define SSL3_RT_HEADER 0x100
+#define SSL3_RT_CLIENT_HELLO_INNER 0x101
 
 #define SSL3_AL_WARNING 1
 #define SSL3_AL_FATAL 2
diff --git a/src/ssl/encrypted_client_hello.cc b/src/ssl/encrypted_client_hello.cc
index 64fee3d..9e9adfe 100644
--- a/src/ssl/encrypted_client_hello.cc
+++ b/src/ssl/encrypted_client_hello.cc
@@ -203,6 +203,12 @@
         OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
         return false;
       }
+      // The ECH extension itself is not in the AAD and may not be referenced.
+      if (want == TLSEXT_TYPE_encrypted_client_hello) {
+        *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+        OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_OUTER_EXTENSION);
+        return false;
+      }
       // Seek to |want| in |outer_extensions|. |ext_list| is required to match
       // ClientHelloOuter in order.
       uint16_t found;
@@ -210,7 +216,7 @@
       do {
         if (CBS_len(&outer_extensions) == 0) {
           *out_alert = SSL_AD_ILLEGAL_PARAMETER;
-          OPENSSL_PUT_ERROR(SSL, SSL_R_OUTER_EXTENSION_NOT_FOUND);
+          OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_OUTER_EXTENSION);
           return false;
         }
         if (!CBS_get_u16(&outer_extensions, &found) ||
@@ -252,8 +258,8 @@
   return true;
 }
 
-bool ssl_client_hello_decrypt(EVP_HPKE_CTX *hpke_ctx, Array<uint8_t> *out,
-                              bool *out_is_decrypt_error,
+bool ssl_client_hello_decrypt(SSL_HANDSHAKE *hs, uint8_t *out_alert,
+                              bool *out_is_decrypt_error, Array<uint8_t> *out,
                               const SSL_CLIENT_HELLO *client_hello_outer,
                               Span<const uint8_t> payload) {
   *out_is_decrypt_error = false;
@@ -264,6 +270,7 @@
   Array<uint8_t> aad;
   if (!aad.CopyFrom(MakeConstSpan(client_hello_outer->client_hello,
                                   client_hello_outer->client_hello_len))) {
+    *out_alert = SSL_AD_INTERNAL_ERROR;
     return false;
   }
 
@@ -278,35 +285,47 @@
       payload.data() - client_hello_outer->client_hello, payload.size());
   OPENSSL_memset(payload_aad.data(), 0, payload_aad.size());
 
+  // Decrypt the EncodedClientHelloInner.
+  Array<uint8_t> encoded;
 #if defined(BORINGSSL_UNSAFE_FUZZER_MODE)
   // In fuzzer mode, disable encryption to improve coverage. We reserve a short
   // input to signal decryption failure, so the fuzzer can explore fallback to
   // ClientHelloOuter.
   const uint8_t kBadPayload[] = {0xff};
   if (payload == kBadPayload) {
+    *out_alert = SSL_AD_DECRYPT_ERROR;
     *out_is_decrypt_error = true;
     OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
     return false;
   }
-  if (!out->CopyFrom(payload)) {
+  if (!encoded.CopyFrom(payload)) {
+    *out_alert = SSL_AD_INTERNAL_ERROR;
     return false;
   }
 #else
-  // Attempt to decrypt into |out|.
-  if (!out->Init(payload.size())) {
-    OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+  if (!encoded.Init(payload.size())) {
+    *out_alert = SSL_AD_INTERNAL_ERROR;
     return false;
   }
   size_t len;
-  if (!EVP_HPKE_CTX_open(hpke_ctx, out->data(), &len, out->size(),
-                         payload.data(), payload.size(), aad.data(),
-                         aad.size())) {
+  if (!EVP_HPKE_CTX_open(hs->ech_hpke_ctx.get(), encoded.data(), &len,
+                         encoded.size(), payload.data(), payload.size(),
+                         aad.data(), aad.size())) {
+    *out_alert = SSL_AD_DECRYPT_ERROR;
     *out_is_decrypt_error = true;
     OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
     return false;
   }
-  out->Shrink(len);
+  encoded.Shrink(len);
 #endif
+
+  if (!ssl_decode_client_hello_inner(hs->ssl, out_alert, out, encoded,
+                                     client_hello_outer)) {
+    return false;
+  }
+
+  ssl_do_msg_callback(hs->ssl, /*is_write=*/0, SSL3_RT_CLIENT_HELLO_INNER,
+                      *out);
   return true;
 }
 
@@ -789,6 +808,8 @@
                    binder_len);
   }
 
+  ssl_do_msg_callback(ssl, /*is_write=*/1, SSL3_RT_CLIENT_HELLO_INNER,
+                      hello_inner);
   if (!hs->inner_transcript.Update(hello_inner)) {
     return false;
   }
diff --git a/src/ssl/handshake_client.cc b/src/ssl/handshake_client.cc
index 17b41e0..e630121 100644
--- a/src/ssl/handshake_client.cc
+++ b/src/ssl/handshake_client.cc
@@ -331,7 +331,7 @@
   Array<uint8_t> msg;
   if (!ssl->method->init_message(ssl, cbb.get(), &body, SSL3_MT_CLIENT_HELLO) ||
       !ssl_write_client_hello_without_extensions(hs, &body, type,
-                                                 /*empty_session_id*/ false) ||
+                                                 /*empty_session_id=*/false) ||
       !ssl_add_clienthello_tlsext(hs, &body, /*out_encoded=*/nullptr,
                                   &needs_psk_binder, type, CBB_len(&body)) ||
       !ssl->method->finish_message(ssl, cbb.get(), &msg)) {
diff --git a/src/ssl/handshake_server.cc b/src/ssl/handshake_server.cc
index 1d03c55..15820be 100644
--- a/src/ssl/handshake_server.cc
+++ b/src/ssl/handshake_server.cc
@@ -554,29 +554,22 @@
       ERR_clear_error();
       continue;
     }
-    Array<uint8_t> encoded_client_hello_inner;
     bool is_decrypt_error;
-    if (!ssl_client_hello_decrypt(hs->ech_hpke_ctx.get(),
-                                  &encoded_client_hello_inner,
-                                  &is_decrypt_error, client_hello, payload)) {
+    if (!ssl_client_hello_decrypt(hs, out_alert, &is_decrypt_error,
+                                  &hs->ech_client_hello_buf, client_hello,
+                                  payload)) {
       if (is_decrypt_error) {
         // Ignore the error and try another ECHConfig.
         ERR_clear_error();
+        // The |out_alert| calling convention currently relies on a default of
+        // |SSL_AD_DECODE_ERROR|. https://crbug.com/boringssl/373 tracks
+        // switching to sum types, which avoids this.
+        *out_alert = SSL_AD_DECODE_ERROR;
         continue;
       }
       OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
       return false;
     }
-
-    // Recover the ClientHelloInner from the EncodedClientHelloInner.
-    bssl::Array<uint8_t> client_hello_inner;
-    if (!ssl_decode_client_hello_inner(ssl, out_alert, &client_hello_inner,
-                                       encoded_client_hello_inner,
-                                       client_hello)) {
-      OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
-      return false;
-    }
-    hs->ech_client_hello_buf = std::move(client_hello_inner);
     hs->ech_config_id = config_id;
     ssl->s3->ech_status = ssl_ech_accepted;
     return true;
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index 5196f17..8f68fc5 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -1498,17 +1498,19 @@
 // ClientHelloOuter |client_hello_outer|. If successful, it writes the recovered
 // ClientHelloInner to |out_client_hello_inner|. It returns true on success and
 // false on failure.
+//
+// This function is exported for fuzzing.
 OPENSSL_EXPORT bool ssl_decode_client_hello_inner(
     SSL *ssl, uint8_t *out_alert, Array<uint8_t> *out_client_hello_inner,
     Span<const uint8_t> encoded_client_hello_inner,
     const SSL_CLIENT_HELLO *client_hello_outer);
 
-// ssl_client_hello_decrypt attempts to decrypt the |payload| and writes the
-// result to |*out|. |payload| must point into |client_hello_outer|. It returns
-// true on success and false on error. On error, it sets |*out_is_decrypt_error|
-// to whether the failure was due to a bad ciphertext.
-bool ssl_client_hello_decrypt(EVP_HPKE_CTX *hpke_ctx, Array<uint8_t> *out,
-                              bool *out_is_decrypt_error,
+// ssl_client_hello_decrypt attempts to decrypt and decode the |payload|. It
+// writes the result to |*out|. |payload| must point into |client_hello_outer|.
+// It returns true on success and false on error. On error, it sets
+// |*out_is_decrypt_error| to whether the failure was due to a bad ciphertext.
+bool ssl_client_hello_decrypt(SSL_HANDSHAKE *hs, uint8_t *out_alert,
+                              bool *out_is_decrypt_error, Array<uint8_t> *out,
                               const SSL_CLIENT_HELLO *client_hello_outer,
                               Span<const uint8_t> payload);
 
@@ -3511,7 +3513,7 @@
   bssl::UniquePtr<bssl::CERT> cert;
 
   // callback that allows applications to peek at protocol messages
-  void (*msg_callback)(int write_p, int version, int content_type,
+  void (*msg_callback)(int is_write, int version, int content_type,
                        const void *buf, size_t len, SSL *ssl,
                        void *arg) = nullptr;
   void *msg_callback_arg = nullptr;
diff --git a/src/ssl/test/CMakeLists.txt b/src/ssl/test/CMakeLists.txt
index bb9bd81..f02d6e2 100644
--- a/src/ssl/test/CMakeLists.txt
+++ b/src/ssl/test/CMakeLists.txt
@@ -17,7 +17,7 @@
 
 target_link_libraries(bssl_shim test_support_lib ssl crypto)
 
-if(UNIX AND NOT APPLE AND NOT ANDROID)
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
   add_executable(
     handshaker
 
diff --git a/src/ssl/test/runner/hpke/hpke.go b/src/ssl/test/runner/hpke/hpke.go
index e6fc7be..b65dcf3 100644
--- a/src/ssl/test/runner/hpke/hpke.go
+++ b/src/ssl/test/runner/hpke/hpke.go
@@ -14,7 +14,7 @@
 
 // Package hpke implements Hybrid Public Key Encryption (HPKE).
 //
-// See https://tools.ietf.org/html/draft-irtf-cfrg-hpke-12.
+// See RFC 9180.
 package hpke
 
 import (
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index cfff714..4c1c955 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -16776,9 +16776,7 @@
 				},
 				shouldFail:         true,
 				expectedLocalError: "remote error: illegal parameter",
-				// The decoding algorithm relies on the ordering requirement, so
-				// the wrong order appears as a missing extension.
-				expectedError: ":OUTER_EXTENSION_NOT_FOUND:",
+				expectedError:      ":INVALID_OUTER_EXTENSION:",
 			})
 
 			// Test that the server rejects duplicated values in ech_outer_extensions.
@@ -16812,9 +16810,7 @@
 				},
 				shouldFail:         true,
 				expectedLocalError: "remote error: illegal parameter",
-				// The decoding algorithm relies on the ordering requirement, so
-				// duplicates appear as missing extensions.
-				expectedError: ":OUTER_EXTENSION_NOT_FOUND:",
+				expectedError:      ":INVALID_OUTER_EXTENSION:",
 			})
 
 			// Test that the server rejects references to missing extensions in
@@ -16843,7 +16839,7 @@
 				},
 				shouldFail:         true,
 				expectedLocalError: "remote error: illegal parameter",
-				expectedError:      ":DECODE_ERROR:",
+				expectedError:      ":INVALID_OUTER_EXTENSION:",
 			})
 
 			// Test that the server rejects references to the ECH extension in
@@ -16871,7 +16867,46 @@
 				},
 				shouldFail:         true,
 				expectedLocalError: "remote error: illegal parameter",
-				expectedError:      ":DECODE_ERROR:",
+				expectedError:      ":INVALID_OUTER_EXTENSION:",
+			})
+
+			// Test the message callback is correctly reported with ECH.
+			clientAndServerHello := "read hs 1\nread clienthelloinner\nwrite hs 2\n"
+			expectMsgCallback := clientAndServerHello + "write ccs\n"
+			if hrr {
+				expectMsgCallback += clientAndServerHello
+			}
+			// EncryptedExtensions onwards.
+			expectMsgCallback += `write hs 8
+write hs 11
+write hs 15
+write hs 20
+read hs 20
+write hs 4
+write hs 4
+`
+			testCases = append(testCases, testCase{
+				testType: serverTest,
+				protocol: protocol,
+				name:     prefix + "ECH-Server-MessageCallback" + suffix,
+				config: Config{
+					ServerName:      "secret.example",
+					ClientECHConfig: echConfig.ECHConfig,
+					DefaultCurves:   defaultCurves,
+					Bugs: ProtocolBugs{
+						NoCloseNotify: true, // Align QUIC and TCP traces.
+					},
+				},
+				flags: []string{
+					"-ech-server-config", base64FlagValue(echConfig.ECHConfig.Raw),
+					"-ech-server-key", base64FlagValue(echConfig.Key),
+					"-ech-is-retry-config", "1",
+					"-expect-ech-accept",
+					"-expect-msg-callback", expectMsgCallback,
+				},
+				expectations: connectionExpectations{
+					echAccepted: true,
+				},
 			})
 		}
 
@@ -18622,6 +18657,60 @@
 			shouldFail:    true,
 			expectedError: ":INCONSISTENT_ECH_NEGOTIATION:",
 		})
+
+		// Test the message callback is correctly reported, with and without
+		// HelloRetryRequest.
+		clientAndServerHello := "write clienthelloinner\nwrite hs 1\nread hs 2\n"
+		// EncryptedExtensions onwards.
+		finishHandshake := `read hs 8
+read hs 11
+read hs 15
+read hs 20
+write hs 20
+read hs 4
+read hs 4
+`
+		testCases = append(testCases, testCase{
+			testType: clientTest,
+			protocol: protocol,
+			name:     prefix + "ECH-Client-MessageCallback",
+			config: Config{
+				MinVersion:       VersionTLS13,
+				MaxVersion:       VersionTLS13,
+				ServerECHConfigs: []ServerECHConfig{echConfig},
+				Bugs: ProtocolBugs{
+					NoCloseNotify: true, // Align QUIC and TCP traces.
+				},
+			},
+			flags: []string{
+				"-ech-config-list", base64FlagValue(CreateECHConfigList(echConfig.ECHConfig.Raw)),
+				"-expect-ech-accept",
+				"-expect-msg-callback", clientAndServerHello + "write ccs\n" + finishHandshake,
+			},
+			expectations: connectionExpectations{echAccepted: true},
+		})
+		testCases = append(testCases, testCase{
+			testType: clientTest,
+			protocol: protocol,
+			name:     prefix + "ECH-Client-MessageCallback-HelloRetryRequest",
+			config: Config{
+				MinVersion:       VersionTLS13,
+				MaxVersion:       VersionTLS13,
+				CurvePreferences: []CurveID{CurveP384},
+				ServerECHConfigs: []ServerECHConfig{echConfig},
+				Bugs: ProtocolBugs{
+					ExpectMissingKeyShare: true, // Check we triggered HRR.
+					NoCloseNotify:         true, // Align QUIC and TCP traces.
+				},
+			},
+			flags: []string{
+				"-ech-config-list", base64FlagValue(CreateECHConfigList(echConfig.ECHConfig.Raw)),
+				"-expect-ech-accept",
+				"-expect-hrr", // Check we triggered HRR.
+				"-expect-msg-callback", clientAndServerHello + "write ccs\n" + clientAndServerHello + finishHandshake,
+			},
+			expectations: connectionExpectations{echAccepted: true},
+		})
 	}
 }
 
@@ -19220,8 +19309,22 @@
 		noneOfPattern = strings.Split(*skipTest, ";")
 	}
 
+	shardIndex, shardTotal, err := getSharding()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(1)
+	}
+
+	if shardTotal > 0 {
+		fmt.Printf("This is shard %d of 0..%d (inclusive)\n", shardIndex, shardTotal-1)
+	}
+
 	var foundTest bool
 	for i := range testCases {
+		if shardTotal > 0 && i%shardTotal != shardIndex {
+			continue
+		}
+
 		matched, err := match(oneOfPatternIfAny, noneOfPattern, testCases[i].name)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "Error matching pattern: %s\n", err)
@@ -19259,7 +19362,7 @@
 		}
 	}
 
-	if !foundTest {
+	if !foundTest && shardTotal == 0 {
 		fmt.Fprintf(os.Stderr, "No tests run\n")
 		os.Exit(1)
 	}
diff --git a/src/ssl/test/runner/sharding.go b/src/ssl/test/runner/sharding.go
new file mode 100644
index 0000000..5061a6f
--- /dev/null
+++ b/src/ssl/test/runner/sharding.go
@@ -0,0 +1,77 @@
+// Copyright (c) 2022, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+package runner
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"strconv"
+)
+
+const (
+	shardStatusFileEnv = "TEST_SHARD_STATUS_FILE"
+	shardTotalEnv      = "TEST_TOTAL_SHARDS"
+	shardIndexEnv      = "TEST_SHARD_INDEX"
+	shardPrefix        = "RUNNER_"
+)
+
+func init() {
+	// When run under `go test`, init() functions may be run twice if the
+	// test binary ends up forking and execing itself. Therefore we move
+	// the environment variables to names that don't interfere with Go's
+	// own support for sharding. If we recorded and erased them, then they
+	// wouldn't exist the second time the binary runs.
+	for _, key := range []string{shardStatusFileEnv, shardTotalEnv, shardIndexEnv} {
+		value := os.Getenv(key)
+		if len(value) > 0 {
+			os.Setenv(shardPrefix+key, value)
+			os.Setenv(key, "")
+		}
+	}
+}
+
+// getSharding returns the shard index and count, or zeros if sharding is not
+// enabled.
+func getSharding() (index, total int, err error) {
+	statusFile := os.Getenv(shardPrefix + shardStatusFileEnv)
+	totalNumStr := os.Getenv(shardPrefix + shardTotalEnv)
+	indexStr := os.Getenv(shardPrefix + shardIndexEnv)
+	if len(totalNumStr) == 0 || len(indexStr) == 0 {
+		return 0, 0, nil
+	}
+
+	totalNum, err := strconv.Atoi(totalNumStr)
+	if err != nil {
+		return 0, 0, fmt.Errorf("$%s is %q, but expected a number\n", shardTotalEnv, totalNumStr)
+	}
+
+	index, err = strconv.Atoi(indexStr)
+	if err != nil {
+		return 0, 0, fmt.Errorf("$%s is %q, but expected a number\n", shardIndexEnv, indexStr)
+	}
+
+	if index < 0 || index >= totalNum {
+		return 0, 0, fmt.Errorf("shard index/total of %d/%d is invalid\n", index, totalNum)
+	}
+
+	if len(statusFile) > 0 {
+		if err := ioutil.WriteFile(statusFile, nil, 0664); err != nil {
+			return 0, 0, err
+		}
+	}
+
+	return index, totalNum, nil
+}
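
The runner applies these values with a simple round-robin rule, visible in the runner.go hunk above: shard index runs test i iff i % total == index, so every test runs in exactly one shard with no coordination between shards. The same arithmetic as a standalone C sketch:

#include <stdio.h>

int main(void) {
  const int kTotal = 4;     // TEST_TOTAL_SHARDS
  const int kIndex = 1;     // TEST_SHARD_INDEX
  const int kNumTests = 10;
  for (int i = 0; i < kNumTests; i++) {
    if (i % kTotal == kIndex) {
      printf("shard %d runs test %d\n", kIndex, i);
    }
  }
  return 0;
}
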
diff --git a/src/ssl/test/test_config.cc b/src/ssl/test/test_config.cc
index 9a0f63d..a6409d6 100644
--- a/src/ssl/test/test_config.cc
+++ b/src/ssl/test/test_config.cc
@@ -602,6 +602,7 @@
       state->msg_callback_text += "v2clienthello\n";
       return;
 
+    case SSL3_RT_CLIENT_HELLO_INNER:
     case SSL3_RT_HANDSHAKE: {
       CBS cbs;
       CBS_init(&cbs, buf_u8, len);
@@ -619,10 +620,19 @@
         return;
       }
       char text[16];
-      snprintf(text, sizeof(text), "hs %d\n", type);
-      state->msg_callback_text += text;
-      if (!is_write) {
-        state->last_message_received = type;
+      if (content_type == SSL3_RT_CLIENT_HELLO_INNER) {
+        if (type != SSL3_MT_CLIENT_HELLO) {
+          fprintf(stderr, "Invalid header for ClientHelloInner.\n");
+          state->msg_callback_ok = false;
+          return;
+        }
+        state->msg_callback_text += "clienthelloinner\n";
+      } else {
+        snprintf(text, sizeof(text), "hs %d\n", type);
+        state->msg_callback_text += text;
+        if (!is_write) {
+          state->last_message_received = type;
+        }
       }
       return;
     }
diff --git a/src/ssl/tls13_server.cc b/src/ssl/tls13_server.cc
index 2f000e5..dbf239d 100644
--- a/src/ssl/tls13_server.cc
+++ b/src/ssl/tls13_server.cc
@@ -658,28 +658,16 @@
     }
 
     // Decrypt the payload with the HPKE context from the first ClientHello.
-    Array<uint8_t> encoded_client_hello_inner;
+    uint8_t alert = SSL_AD_DECODE_ERROR;
     bool unused;
-    if (!ssl_client_hello_decrypt(hs->ech_hpke_ctx.get(),
-                                  &encoded_client_hello_inner, &unused,
-                                  &client_hello, payload)) {
+    if (!ssl_client_hello_decrypt(hs, &alert, &unused,
+                                  &hs->ech_client_hello_buf, &client_hello,
+                                  payload)) {
       // Decryption failure is fatal in the second ClientHello.
       OPENSSL_PUT_ERROR(SSL, SSL_R_DECRYPTION_FAILED);
-      ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECRYPT_ERROR);
-      return ssl_hs_error;
-    }
-
-    // Recover the ClientHelloInner from the EncodedClientHelloInner.
-    uint8_t alert = SSL_AD_DECODE_ERROR;
-    bssl::Array<uint8_t> client_hello_inner;
-    if (!ssl_decode_client_hello_inner(ssl, &alert, &client_hello_inner,
-                                       encoded_client_hello_inner,
-                                       &client_hello)) {
-      OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
       ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
       return ssl_hs_error;
     }
-    hs->ech_client_hello_buf = std::move(client_hello_inner);
 
     // Reparse |client_hello| from the buffer owned by |hs|.
     if (!hs->GetClientHello(&msg, &client_hello)) {
diff --git a/src/tool/server.cc b/src/tool/server.cc
index 18b692d..ebecee0 100644
--- a/src/tool/server.cc
+++ b/src/tool/server.cc
@@ -132,21 +132,37 @@
 
 static bssl::UniquePtr<X509> MakeSelfSignedCert(EVP_PKEY *evp_pkey,
                                                 const int valid_days) {
+  uint64_t serial;
   bssl::UniquePtr<X509> x509(X509_new());
-  uint32_t serial;
-  RAND_bytes(reinterpret_cast<uint8_t*>(&serial), sizeof(serial));
-  ASN1_INTEGER_set(X509_get_serialNumber(x509.get()), serial >> 1);
-  X509_gmtime_adj(X509_get_notBefore(x509.get()), 0);
-  X509_gmtime_adj(X509_get_notAfter(x509.get()), 60 * 60 * 24 * valid_days);
+  if (!x509 ||  //
+      !X509_set_version(x509.get(), X509_VERSION_3) ||
+      !RAND_bytes(reinterpret_cast<uint8_t *>(&serial), sizeof(serial)) ||
+      !ASN1_INTEGER_set_uint64(X509_get_serialNumber(x509.get()), serial) ||
+      !X509_gmtime_adj(X509_get_notBefore(x509.get()), 0) ||
+      !X509_gmtime_adj(X509_get_notAfter(x509.get()),
+                       60 * 60 * 24 * valid_days)) {
+    return nullptr;
+  }
 
-  X509_NAME* subject = X509_get_subject_name(x509.get());
-  X509_NAME_add_entry_by_txt(subject, "C", MBSTRING_ASC,
-                             reinterpret_cast<const uint8_t *>("US"), -1, -1,
-                             0);
-  X509_NAME_add_entry_by_txt(subject, "O", MBSTRING_ASC,
-                             reinterpret_cast<const uint8_t *>("BoringSSL"), -1,
-                             -1, 0);
-  X509_set_issuer_name(x509.get(), subject);
+  X509_NAME *subject = X509_get_subject_name(x509.get());
+  if (!X509_NAME_add_entry_by_txt(subject, "C", MBSTRING_ASC,
+                                  reinterpret_cast<const uint8_t *>("US"), -1,
+                                  -1, 0) ||
+      !X509_NAME_add_entry_by_txt(
+          subject, "O", MBSTRING_ASC,
+          reinterpret_cast<const uint8_t *>("BoringSSL"), -1, -1, 0) ||
+      !X509_set_issuer_name(x509.get(), subject)) {
+    return nullptr;
+  }
+
+  // macOS requires an explicit EKU extension.
+  bssl::UniquePtr<STACK_OF(ASN1_OBJECT)> ekus(sk_ASN1_OBJECT_new_null());
+  if (!ekus ||
+      !sk_ASN1_OBJECT_push(ekus.get(), OBJ_nid2obj(NID_server_auth)) ||
+      !X509_add1_ext_i2d(x509.get(), NID_ext_key_usage, ekus.get(), /*crit=*/1,
+                         /*flags=*/0)) {
+    return nullptr;
+  }
 
   if (!X509_set_pubkey(x509.get(), evp_pkey)) {
     fprintf(stderr, "Failed to set public key.\n");
diff --git a/src/util/BUILD.toplevel b/src/util/BUILD.toplevel
index 462a24f..cfa695a 100644
--- a/src/util/BUILD.toplevel
+++ b/src/util/BUILD.toplevel
@@ -18,10 +18,11 @@
     "crypto_headers",
     "crypto_internal_headers",
     "crypto_sources",
+    "crypto_sources_apple_aarch64",
+    "crypto_sources_apple_x86_64",
     "crypto_sources_linux_aarch64",
     "crypto_sources_linux_ppc64le",
     "crypto_sources_linux_x86_64",
-    "crypto_sources_mac_x86_64",
     "fips_fragments",
     "ssl_headers",
     "ssl_internal_headers",
@@ -36,52 +37,42 @@
 
 config_setting(
     name = "linux_aarch64",
-    values = {"cpu": "aarch64"},
+    constraint_values = [
+        "@platforms//os:linux",
+        "@platforms//cpu:aarch64",
+    ],
 )
 
 config_setting(
     name = "linux_x86_64",
-    values = {"cpu": "k8"},
+    constraint_values = [
+        "@platforms//os:linux",
+        "@platforms//cpu:x86_64",
+    ],
 )
 
 config_setting(
     name = "linux_ppc64le",
-    values = {"cpu": "ppc"},
+    constraint_values = [
+        "@platforms//os:linux",
+        "@platforms//cpu:ppc",
+    ],
 )
 
 config_setting(
-    name = "mac_x86_64",
-    values = {"cpu": "darwin"},
+    name = "macos_aarch64",
+    constraint_values = [
+        "@platforms//os:macos",
+        "@platforms//cpu:aarch64",
+    ],
 )
 
 config_setting(
-    name = "windows_x86_64",
-    values = {"cpu": "x64_windows"},
-)
-
-config_setting(
-    name = "android_legacy",
-    values = {"crosstool_top": "//external:android/crosstool"},
-)
-
-config_setting(
-    name = "android_stlport",
-    values = {"crosstool_top": "@androidndk//:toolchain-stlport"},
-)
-
-config_setting(
-    name = "android_libcpp",
-    values = {"crosstool_top": "@androidndk//:toolchain-libcpp"},
-)
-
-config_setting(
-    name = "android_gnu_libstdcpp",
-    values = {"crosstool_top": "@androidndk//:toolchain-gnu-libstdcpp"},
-)
-
-config_setting(
-    name = "android_default",
-    values = {"crosstool_top": "@androidndk//:default_crosstool"},
+    name = "macos_x86_64",
+    constraint_values = [
+        "@platforms//os:macos",
+        "@platforms//cpu:x86_64",
+    ],
 )
 
 posix_copts = [
@@ -98,11 +89,6 @@
     "-Wwrite-strings",
     "-Wshadow",
     "-fno-common",
-
-    # Modern build environments should be able to set this to use atomic
-    # operations for reference counting rather than locks. However, it's
-    # known not to work on some Android builds.
-    # "-DOPENSSL_C11_ATOMIC",
 ]
 
 linux_copts = posix_copts + [
@@ -113,24 +99,29 @@
 ]
 
 boringssl_copts = select({
-    ":linux_aarch64": linux_copts,
-    ":linux_ppc64le": linux_copts,
-    ":linux_x86_64": linux_copts,
-    ":mac_x86_64": posix_copts,
-    ":windows_x86_64": [
-        "-DWIN32_LEAN_AND_MEAN",
-        "-DOPENSSL_NO_ASM",
-    ],
-    "//conditions:default": ["-DOPENSSL_NO_ASM"],
+    "@platforms//os:linux": linux_copts,
+    "@platforms//os:macos": posix_copts,
+    "@platforms//os:windows": ["-DWIN32_LEAN_AND_MEAN"],
+    "//conditions:default": [],
 })
 
+# These selects must be kept in sync.
 crypto_sources_asm = select({
     ":linux_aarch64": crypto_sources_linux_aarch64,
     ":linux_ppc64le": crypto_sources_linux_ppc64le,
     ":linux_x86_64": crypto_sources_linux_x86_64,
-    ":mac_x86_64": crypto_sources_mac_x86_64,
+    ":macos_aarch64": crypto_sources_apple_aarch64,
+    ":macos_x86_64": crypto_sources_apple_x86_64,
     "//conditions:default": [],
 })
+boringssl_copts += select({
+    ":linux_aarch64": [],
+    ":linux_ppc64le": [],
+    ":linux_x86_64": [],
+    ":macos_aarch64": [],
+    ":macos_x86_64": [],
+    "//conditions:default": ["-DOPENSSL_NO_ASM"],
+})
 
 # For C targets only (not C++), compile with C11 support.
 posix_copts_c11 = [
@@ -141,10 +132,8 @@
 ]
 
 boringssl_copts_c11 = boringssl_copts + select({
-    ":linux_aarch64": posix_copts_c11,
-    ":linux_ppc64le": posix_copts_c11,
-    ":linux_x86_64": posix_copts_c11,
-    ":mac_x86_64": posix_copts_c11,
+    "@platforms//os:linux": posix_copts_c11,
+    "@platforms//os:macos": posix_copts_c11,
     "//conditions:default": [],
 })
 
@@ -155,10 +144,8 @@
 ]
 
 boringssl_copts_cxx = boringssl_copts + select({
-    ":linux_aarch64": posix_copts_cxx,
-    ":linux_ppc64le": posix_copts_cxx,
-    ":linux_x86_64": posix_copts_cxx,
-    ":mac_x86_64": posix_copts_cxx,
+    "@platforms//os:linux": posix_copts_cxx,
+    "@platforms//os:macos": posix_copts_cxx,
     "//conditions:default": [],
 })
 
@@ -171,13 +158,9 @@
     linkopts = select({
         # Android supports pthreads, but does not provide a libpthread
         # to link against.
-        ":android_legacy": [],
-        ":android_stlport": [],
-        ":android_libcpp": [],
-        ":android_gnu_libstdcpp": [],
-        ":android_default": [],
-        ":mac_x86_64": [],
-        ":windows_x86_64": ["-defaultlib:advapi32.lib"],
+        "@platforms//os:android": [],
+        "@platforms//os:macos": [],
+        "@platforms//os:windows": ["-defaultlib:advapi32.lib"],
         "//conditions:default": ["-lpthread"],
     }),
     visibility = ["//visibility:public"],
diff --git a/src/util/bot/DEPS b/src/util/bot/DEPS
index e3c95f3..574d94b 100644
--- a/src/util/bot/DEPS
+++ b/src/util/bot/DEPS
@@ -187,7 +187,7 @@
     'action': [ 'download_from_google_storage',
                 '--no_resume',
                 '--bucket', 'chrome-boringssl-sde',
-                '-s', 'boringssl/util/bot/sde-linux64.tar.bz2.sha1'
+                '-s', 'boringssl/util/bot/sde-linux64.tar.xz.sha1'
     ],
   },
   {
@@ -196,7 +196,7 @@
     'condition': 'checkout_sde and host_os == "linux"',
     'action': [ 'python3',
                 'boringssl/util/bot/extract.py',
-                'boringssl/util/bot/sde-linux64.tar.bz2',
+                'boringssl/util/bot/sde-linux64.tar.xz',
                 'boringssl/util/bot/sde-linux64/',
     ],
   },
@@ -207,7 +207,7 @@
     'action': [ 'download_from_google_storage',
                 '--no_resume',
                 '--bucket', 'chrome-boringssl-sde',
-                '-s', 'boringssl/util/bot/sde-win32.tar.bz2.sha1'
+                '-s', 'boringssl/util/bot/sde-win32.tar.xz.sha1'
     ],
   },
   {
@@ -216,7 +216,7 @@
     'condition': 'checkout_sde and host_os == "win"',
     'action': [ 'python3',
                 'boringssl/util/bot/extract.py',
-                'boringssl/util/bot/sde-win32.tar.bz2',
+                'boringssl/util/bot/sde-win32.tar.xz',
                 'boringssl/util/bot/sde-win32/',
     ],
   },
diff --git a/src/util/bot/UPDATING b/src/util/bot/UPDATING
index 2e6b914..dad6192 100644
--- a/src/util/bot/UPDATING
+++ b/src/util/bot/UPDATING
@@ -46,13 +46,13 @@
 
     The current revision is strawberry-perl-5.26.2.1-64bit-portable.zip.
 
-Finally, update sde-linux64.tar.bz2 and sde-win32.tar.bz2 by downloading the
+Finally, update sde-linux64.tar.xz and sde-win32.tar.xz by downloading the
 latest release from Intel at
 https://software.intel.com/en-us/articles/intel-software-development-emulator,
 but upload it with the following command. (Note the bucket is different.)
 
-    upload_to_google_storage.py -b chrome-boringssl-sde sde-linux64.tar.bz2 sde-win32.tar.bz2
+    upload_to_google_storage.py -b chrome-boringssl-sde sde-linux64.tar.xz sde-win32.tar.xz
 
-The current revision is sde-external-8.50.0-2020-03-26-*.tar.bz2.
+The current revision is sde-external-9.0.0-2021-11-07-*.tar.xz.
 
 When adding new files, remember to update .gitignore.
diff --git a/src/util/bot/extract.py b/src/util/bot/extract.py
index 9b1b88a..4ef5f65 100644
--- a/src/util/bot/extract.py
+++ b/src/util/bot/extract.py
@@ -118,6 +118,8 @@
     entries = IterateTar(archive, 'gz')
   elif archive.endswith('.tar.bz2'):
     entries = IterateTar(archive, 'bz2')
+  elif archive.endswith('.tar.xz'):
+    entries = IterateTar(archive, 'xz')
   else:
     raise ValueError(archive)
 
diff --git a/src/util/bot/sde-linux64.tar.bz2.sha1 b/src/util/bot/sde-linux64.tar.bz2.sha1
deleted file mode 100644
index c450f63..0000000
--- a/src/util/bot/sde-linux64.tar.bz2.sha1
+++ /dev/null
@@ -1 +0,0 @@
-baacb5a29755e299d3384c41c6dd55f65235ef1f
\ No newline at end of file
diff --git a/src/util/bot/sde-linux64.tar.xz.sha1 b/src/util/bot/sde-linux64.tar.xz.sha1
new file mode 100644
index 0000000..f9ee198
--- /dev/null
+++ b/src/util/bot/sde-linux64.tar.xz.sha1
@@ -0,0 +1 @@
+8bba6e01a47b2cfd9e7429f77256db540031ff43
\ No newline at end of file
diff --git a/src/util/bot/sde-win32.tar.bz2.sha1 b/src/util/bot/sde-win32.tar.bz2.sha1
deleted file mode 100644
index b960747..0000000
--- a/src/util/bot/sde-win32.tar.bz2.sha1
+++ /dev/null
@@ -1 +0,0 @@
-cc2d77ff4a221165a8bb13f43ccfbff6550b90c8
\ No newline at end of file
diff --git a/src/util/bot/sde-win32.tar.xz.sha1 b/src/util/bot/sde-win32.tar.xz.sha1
new file mode 100644
index 0000000..dbaf87f
--- /dev/null
+++ b/src/util/bot/sde-win32.tar.xz.sha1
@@ -0,0 +1 @@
+59ef225031e14e5ac257ada61d416f6ea0c9c080
\ No newline at end of file
diff --git a/src/util/doc.go b/src/util/doc.go
index a38e078..651998e 100644
--- a/src/util/doc.go
+++ b/src/util/doc.go
@@ -503,7 +503,7 @@
 
 // markupPipeWords converts |s| into an HTML string, safe to be included outside
 // a tag, while also marking up words surrounded by |.
-func markupPipeWords(allDecls map[string]string, s string) template.HTML {
+func markupPipeWords(allDecls map[string]string, s string, linkDecls bool) template.HTML {
 	// It is safe to look for '|' in the HTML-escaped version of |s|
 	// below. The escaped version cannot include '|' inside tags because
 	// there are no tags by construction.
@@ -524,12 +524,10 @@
 		if i > 0 && (j == -1 || j > i) {
 			ret += "<tt>"
 			anchor, isLink := allDecls[s[:i]]
-			if isLink {
-				ret += fmt.Sprintf("<a href=\"%s\">", template.HTMLEscapeString(anchor))
-			}
-			ret += s[:i]
-			if isLink {
-				ret += "</a>"
+			if linkDecls && isLink {
+				ret += fmt.Sprintf("<a href=\"%s\">%s</a>", template.HTMLEscapeString(anchor), s[:i])
+			} else {
+				ret += s[:i]
 			}
 			ret += "</tt>"
 			s = s[i+1:]
@@ -602,11 +600,12 @@
 
 	headerTmpl := template.New("headerTmpl")
 	headerTmpl.Funcs(template.FuncMap{
-		"firstSentence":   firstSentence,
-		"markupPipeWords": func(s string) template.HTML { return markupPipeWords(allDecls, s) },
-		"markupFirstWord": markupFirstWord,
-		"markupRFC":       markupRFC,
-		"newlinesToBR":    newlinesToBR,
+		"firstSentence":         firstSentence,
+		"markupPipeWords":       func(s string) template.HTML { return markupPipeWords(allDecls, s, true /* linkDecls */) },
+		"markupPipeWordsNoLink": func(s string) template.HTML { return markupPipeWords(allDecls, s, false /* linkDecls */) },
+		"markupFirstWord":       markupFirstWord,
+		"markupRFC":             markupRFC,
+		"newlinesToBR":          newlinesToBR,
 	})
 	headerTmpl, err := headerTmpl.Parse(`<!DOCTYPE html>
 <html>
@@ -623,12 +622,12 @@
       <a href="headers.html">All headers</a>
     </div>
 
-    {{range .Preamble}}<p>{{. | markupPipeWords}}</p>{{end}}
+    {{range .Preamble}}<p>{{. | markupPipeWords | markupRFC}}</p>{{end}}
 
     <ol>
       {{range .Sections}}
         {{if not .IsPrivate}}
-          {{if .Anchor}}<li class="header"><a href="#{{.Anchor}}">{{.Preamble | firstSentence | markupPipeWords}}</a></li>{{end}}
+          {{if .Anchor}}<li class="header"><a href="#{{.Anchor}}">{{.Preamble | firstSentence | markupPipeWordsNoLink}}</a></li>{{end}}
           {{range .Decls}}
             {{if .Anchor}}<li><a href="#{{.Anchor}}"><tt>{{.Name}}</tt></a></li>{{end}}
           {{end}}
@@ -641,7 +640,7 @@
         <div class="section" {{if .Anchor}}id="{{.Anchor}}"{{end}}>
         {{if .Preamble}}
           <div class="sectionpreamble">
-          {{range .Preamble}}<p>{{. | markupPipeWords}}</p>{{end}}
+          {{range .Preamble}}<p>{{. | markupPipeWords | markupRFC}}</p>{{end}}
           </div>
         {{end}}
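The markupPipeWordsNoLink variant exists because the table-of-contents entries above are themselves wrapped in <a> tags, so linking declarations there would nest anchors and produce invalid HTML. A standalone Go sketch of the pipe-word markup with the linkDecls switch, simplified from the real markupPipeWords (which also HTML-escapes its input first and handles further edge cases):

    // pipewords.go: a minimal sketch of marking up |word| spans with an
    // optional link to the word's declaration, as in doc.go above. This
    // sketch assumes plain, already-escaped input text.
    package main

    import (
        "fmt"
        "html/template"
        "strings"
    )

    func markupPipeWords(allDecls map[string]string, s string, linkDecls bool) string {
        var ret strings.Builder
        for {
            i := strings.Index(s, "|")
            if i == -1 {
                ret.WriteString(s)
                break
            }
            ret.WriteString(s[:i])
            s = s[i+1:]
            j := strings.Index(s, "|")
            if j == -1 { // unmatched '|': emit it verbatim
                ret.WriteString("|")
                ret.WriteString(s)
                break
            }
            word := s[:j]
            s = s[j+1:]
            ret.WriteString("<tt>")
            if anchor, isLink := allDecls[word]; linkDecls && isLink {
                fmt.Fprintf(&ret, "<a href=\"%s\">%s</a>", template.HTMLEscapeString(anchor), word)
            } else {
                ret.WriteString(word)
            }
            ret.WriteString("</tt>")
        }
        return ret.String()
    }

    func main() {
        decls := map[string]string{"RAND_bytes": "rand.html#RAND_bytes"}
        s := "Fills a buffer via |RAND_bytes|."
        fmt.Println(markupPipeWords(decls, s, true))  // declaration is linked
        fmt.Println(markupPipeWords(decls, s, false)) // <tt> only, no link
    }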
 
diff --git a/src/util/fipstools/CMakeLists.txt b/src/util/fipstools/CMakeLists.txt
new file mode 100644
index 0000000..6359383
--- /dev/null
+++ b/src/util/fipstools/CMakeLists.txt
@@ -0,0 +1,12 @@
+include_directories(../../include)
+
+if(FIPS)
+  add_executable(
+    test_fips
+
+    test_fips.c
+  )
+
+  add_dependencies(test_fips global_target)
+  target_link_libraries(test_fips crypto)
+endif()
diff --git a/src/util/fipstools/acvp/acvptool/acvp/acvp.go b/src/util/fipstools/acvp/acvptool/acvp/acvp.go
index 04f0932..9419508 100644
--- a/src/util/fipstools/acvp/acvptool/acvp/acvp.go
+++ b/src/util/fipstools/acvp/acvptool/acvp/acvp.go
@@ -33,6 +33,8 @@
 	"time"
 )
 
+const loginEndpoint = "acvp/v1/login"
+
 // Server represents an ACVP server.
 type Server struct {
 	// PrefixTokens are access tokens that apply to URLs under a certain prefix.
@@ -239,7 +241,7 @@
 	if json.Unmarshal(jsonBytes, &token) != nil {
 		return false
 	}
-	return token.Expiry > 0 && token.Expiry < uint64(time.Now().Unix())
+	return token.Expiry > 0 && token.Expiry < uint64(time.Now().Add(-10*time.Second).Unix())
 }
 
 func (server *Server) getToken(endPoint string) (string, error) {
@@ -255,7 +257,7 @@
 		var reply struct {
 			AccessToken string `json:"accessToken"`
 		}
-		if err := server.postMessage(&reply, "acvp/v1/login", map[string]string{
+		if err := server.postMessage(&reply, loginEndpoint, map[string]string{
 			"password":    server.totpFunc(),
 			"accessToken": token,
 		}); err != nil {
@@ -278,7 +280,7 @@
 		SizeLimit             int64  `json:"sizeConstraint"`
 	}
 
-	if err := server.postMessage(&reply, "acvp/v1/login", map[string]string{"password": server.totpFunc()}); err != nil {
+	if err := server.postMessage(&reply, loginEndpoint, map[string]string{"password": server.totpFunc()}); err != nil {
 		return err
 	}
 
@@ -372,7 +374,7 @@
 	if err != nil {
 		return nil, err
 	}
-	if len(token) != 0 {
+	if len(token) != 0 && endpoint != loginEndpoint {
 		req.Header.Add("Authorization", "Bearer "+token)
 	}
 	return req, nil
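With this change a token is treated as expired only once it is at least ten seconds past its stated expiry, which gives roughly that much tolerance for clock differences between the client and the ACVP server, and the login endpoint itself never receives a stale bearer token. A standalone sketch of the expiry rule — the "exp" JSON field name is the standard JWT claim and is assumed here, and the ten-second constant simply mirrors the buffer above:

    // tokenexpiry.go: a minimal sketch of the expiry check above. A token
    // counts as expired only once it is past its expiry by more than the
    // skew window.
    package main

    import (
        "encoding/json"
        "fmt"
        "time"
    )

    const expirySkew = 10 * time.Second // illustrative; matches the buffer above

    type token struct {
        Expiry uint64 `json:"exp"` // assumed JWT-style expiry claim
    }

    func looksExpired(jsonBytes []byte, now time.Time) bool {
        var t token
        if json.Unmarshal(jsonBytes, &t) != nil {
            return false
        }
        return t.Expiry > 0 && t.Expiry < uint64(now.Add(-expirySkew).Unix())
    }

    func main() {
        now := time.Now()
        justLapsed := fmt.Sprintf(`{"exp": %d}`, now.Add(-5*time.Second).Unix())
        longGone := fmt.Sprintf(`{"exp": %d}`, now.Add(-30*time.Second).Unix())
        fmt.Println(looksExpired([]byte(justLapsed), now)) // false: inside the skew window
        fmt.Println(looksExpired([]byte(longGone), now))   // true
    }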
diff --git a/src/util/fipstools/break-kat.go b/src/util/fipstools/break-kat.go
new file mode 100644
index 0000000..b500545
--- /dev/null
+++ b/src/util/fipstools/break-kat.go
@@ -0,0 +1,89 @@
+// break-kat corrupts a known-answer-test input in a binary and writes the
+// corrupted binary to stdout. This is used to demonstrate that the KATs in the
+// binary notice the error.
+package main
+
+import (
+	"bytes"
+	"encoding/hex"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"sort"
+)
+
+var (
+	kats = map[string]string{
+		"HMAC-SHA-256":    "dad91293dfcf2a7c8ecd13fe353fa75b",
+		"AES-CBC-encrypt": "078609a6c5ac2544699adf682fa377f9be8ab6aef563e8c56a36b84f557fadd3",
+		"AES-CBC-decrypt": "347aa5a024b28257b36510be583d4f47adb7bbeedc6005bbbd0d0a9f06bb7b10",
+		"AES-GCM-encrypt": "8fcc4099808e75caaff582898848a88d808b55ab4e9370797d940be8cc1d7884",
+		"AES-GCM-decrypt": "35f3058f875760ff09d3120f70c4bc9ed7a86872e13452202176f7371ae04faae1dd391920f5d13953d896785994823c",
+		"DRBG":            "c4da0740d505f1ee280b95e58c4931ac6de846a0152fbb4a3f174cf4787a4f1a40c2b50babe14aae530be5886d910a27",
+		"DRBG-reseed":     "c7161ca36c2309b716e9859bb96c6d49bdc8352103a18cd24ef42ec97ef46bf446eb1a4576c186e9351803763a7912fe",
+		"SHA-1":           "132fd9bad5c1826263bafbb699f707a5",
+		"SHA-256":         "ff3b857da7236a2baa0f396b51522217",
+		"SHA-512":         "212512f8d2ad8322781c6c4d69a9daa1",
+		"TLS-KDF":         "abc3657b094c7628a0b282996fe75a75f4984fd94d4ecc2fcf53a2c469a3f731",
+		"RSA-sign":        "d2b56e53306f720d7929d8708bf46f1c22300305582b115bedcac722d8aa5ab2",
+		"RSA-verify":      "abe2cbc13d6bd39d48db5334ddbf8d070a93bdcb104e2cc5d0ee486ee295f6b31bda126c41890b98b73e70e6b65d82f95c663121755a90744c8d1c21148a1960be0eca446e9ff497f1345c537ef8119b9a4398e95c5c6de2b1c955905c5299d8ce7a3b6ab76380d9babdd15f610237e1f3f2aa1c1f1e770b62fbb596381b2ebdd77ecef9c90d4c92f7b6b05fed2936285fa94826e62055322a33b6f04c74ce69e5d8d737fb838b79d2d48e3daf71387531882531a95ac964d02ea413bf85952982bbc089527daff5b845c9a0f4d14ef1956d9c3acae882d12da66da0f35794f5ee32232333517db9315232a183b991654dbea41615345c885325926744a53915",
+		"ECDSA-sign":      "1e35930be860d0942ca7bbd6f6ded87f157e4de24f81ed4b875c0e018e89a81f",
+		"ECDSA-verify":    "6780c5fc70275e2c7061a0e7877bb174deadeb9887027f3fa83654158ba7f50c2d36e5799790bfbe2183d33e96f3c51f6a232f2a24488c8e5f64c37ea2cf0529",
+		"Z-computation":   "e7604491269afb5b102d6ea52cb59feb70aede6ce3bfb3e0105485abd861d77b",
+		"FFDH":            "a14f8ad36be37b18b8f35864392f150ab7ee22c47e1870052a3f17918274af18aaeaf4cf6aacfde96c9d586eb7ebaff6b03fe3b79a8e2ff9dd6df34caaf2ac70fd3771d026b41a561ee90e4337d0575f8a0bd160c868e7e3cef88aa1d88448b1e4742ba11480a9f8a8b737347c408d74a7d57598c48875629df0c85327a124ddec1ad50cd597a985588434ce19c6f044a1696b5f244b899b7e77d4f6f20213ae8eb15d37eb8e67e6c8bdbc4fd6e17426283da96f23a897b210058c7c70fb126a5bf606dbeb1a6d5cca04184c4e95c2e8a70f50f5c1eabd066bd79c180456316ac02d366eb3b0e7ba82fb70dcbd737ca55734579dd250fffa8e0584be99d32b35",
+	}
+
+	listTests = flag.Bool("list-tests", false, "List known test values and exit")
+)
+
+func main() {
+	flag.Parse()
+
+	if *listTests {
+		for _, kat := range sortedKATs() {
+			fmt.Println(kat)
+		}
+		os.Exit(0)
+	}
+
+	if flag.NArg() != 2 || kats[flag.Arg(1)] == "" {
+		fmt.Fprintln(os.Stderr, "Usage: break-kat <binary path> <test to break> > output")
+		fmt.Fprintln(os.Stderr, "Possible values for <test to break>:")
+		for _, kat := range sortedKATs() {
+			fmt.Fprintln(os.Stderr, " ", kat)
+		}
+		os.Exit(1)
+	}
+
+	inPath := flag.Arg(0)
+	test := flag.Arg(1)
+	testInputValue, err := hex.DecodeString(kats[test])
+	if err != nil {
+		panic("invalid kat data: " + err.Error())
+	}
+
+	binaryContents, err := ioutil.ReadFile(inPath)
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	i := bytes.Index(binaryContents, testInputValue)
+	if i < 0 {
+		fmt.Fprintln(os.Stderr, "Expected test input value was not found in binary.")
+		os.Exit(3)
+	}
+
+	binaryContents[i] ^= 1
+	os.Stdout.Write(binaryContents)
+}
+
+func sortedKATs() []string {
+	var ret []string
+	for kat := range kats {
+		ret = append(ret, kat)
+	}
+	sort.Strings(ret)
+	return ret
+}
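A corrupted binary produced by break-kat — e.g. go run break-kat.go test_fips HMAC-SHA-256 > test_fips_broken, with paths purely illustrative — should then fail its power-on known-answer test when executed. A quick offline sanity check is to confirm the targeted value really was altered; a minimal Go sketch, with hypothetical file names:

    // checkbreak.go: a minimal sketch verifying that break-kat's output no
    // longer contains the targeted known-answer value. The file names here
    // are hypothetical examples.
    package main

    import (
        "bytes"
        "encoding/hex"
        "fmt"
        "os"
    )

    func main() {
        // The HMAC-SHA-256 KAT input from the table above.
        kat, err := hex.DecodeString("dad91293dfcf2a7c8ecd13fe353fa75b")
        if err != nil {
            panic(err)
        }

        orig, err := os.ReadFile("test_fips")
        if err != nil {
            panic(err)
        }
        broken, err := os.ReadFile("test_fips_broken")
        if err != nil {
            panic(err)
        }

        fmt.Println("original contains KAT: ", bytes.Contains(orig, kat))   // expect true
        fmt.Println("corrupted contains KAT:", bytes.Contains(broken, kat)) // expect false
    }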
diff --git a/src/util/fipstools/break-tests-android.sh b/src/util/fipstools/break-tests-android.sh
deleted file mode 100644
index efb166e..0000000
--- a/src/util/fipstools/break-tests-android.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright (c) 2019, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-# This script exists to exercise breaking each of the FIPS tests on an Android
-# device. Since, on Android, BoringCrypto exists in both 32- and 64-bit
-# versions, the first argument must be either "32" or "64" to select which is
-# being tested. The Android source tree must have been setup (with "lunch") for
-# a matching build configuration before using this script to build the
-# binaries. (Although it'll fail non-silently if there's a mismatch.)
-#
-# Since each test needs the FIPS module to be compiled differently, and that
-# can take a long time, this script is run twice: once with "build" as the
-# second argument to run the builds, and then with "run" as the second argument
-# to run each test.
-#
-# Run it with /bin/bash, not /bin/sh, otherwise "read" may fail.
-#
-# In order to reconfigure the build for each test, it needs to set a define. It
-# does so by rewriting a template in external/boringssl/Android.bp and you must
-# add the template value before doing the builds. To do so, insert
-# -DBORINGSSL_FIPS_BREAK_XXX=1 in the cflags list for the module, probably by
-# putting it in the "boringssl_flags" stanza.
-
-set -x
-set -e
-
-if [ ! -f external/boringssl/Android.bp ]; then
-  echo "Must be run from the top-level of an Android source tree."
-  exit 1
-fi
-
-. build/envsetup.sh
-
-TESTS="NONE ECDSA_PWCT CRNG RSA_PWCT AES_CBC AES_GCM DES SHA_1 SHA_256 SHA_512 RSA_SIG DRBG ECDSA_SIG Z_COMPUTATION TLS_KDF FFC_DH"
-
-if [ "x$1" = "x32" ]; then
-  lib="lib"
-  bits="32"
-elif [ "x$1" = "x64" ] ; then
-  lib="lib64"
-  bits="64"
-else
-  echo "First argument must be 32 or 64"
-  exit 1
-fi
-
-if [ "x$2" = "xbuild" ]; then
-  if ! grep -q DBORINGSSL_FIPS_BREAK_XXX=1 external/boringssl/Android.bp; then
-    echo "Missing DBORINGSSL_FIPS_BREAK_XXX in external/boringssl/Android.bp. Edit the file and insert -DBORINGSSL_FIPS_BREAK_XXX=1 in the cflags for the FIPS module"
-    exit 1
-  fi
-
-  printf "\\x1b[1mBuilding modules\\x1b[0m\n"
-  for test in $TESTS; do
-    printf "\\x1b[1mBuilding for ${test}\\x1b[0m\n"
-    cp external/boringssl/Android.bp external/boringssl/Android.bp.orig
-    sed -i -e "s/DBORINGSSL_FIPS_BREAK_XXX/DBORINGSSL_FIPS_BREAK_${test}/" external/boringssl/Android.bp
-    m test_fips
-    dir=test-${bits}-${test}
-    rm -Rf $dir
-    mkdir $dir
-    cp ${ANDROID_PRODUCT_OUT}/system/${lib}/libcrypto.so $dir
-    cp ${ANDROID_PRODUCT_OUT}/system/bin/test_fips $dir
-    if [ $bits = "32" ] ; then
-      if ! file ${dir}/test_fips | grep -q "32-bit" ; then
-        echo "32-bit build requested but binaries don't appear to be 32-bit:"
-        file ${dir}/test_fips
-        exit 1
-      fi
-    else
-      if ! file ${dir}/test_fips | grep -q "64-bit" ; then
-        echo "64-bit build requested but binaries don't appear to be 64-bit:"
-        file ${dir}/test_fips
-        exit 1
-      fi
-    fi
-    cp external/boringssl/Android.bp.orig external/boringssl/Android.bp
-  done
-elif [ "x$2" = "xrun" ]; then
-  printf "\\x1b[1mTesting\\x1b[0m\n"
-  for test in $TESTS; do
-    dir=test-${bits}-${test}
-    if [ ! '(' -d ${dir} -a -f ${dir}/test_fips -a -f ${dir}/libcrypto.so ')' ] ; then
-      echo "Build directory ${dir} is missing or is missing files"
-      exit 1
-    fi
-    adb push ${dir}/* /data/local/tmp
-    printf "\\x1b[1mTesting ${test}\\x1b[0m\n"
-    adb shell -n -t -x LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/test_fips
-    read
-  done
-
-  printf "\\x1b[1mTesting integrity}\\x1b[0m\n"
-  src=test-${bits}-NONE
-  dir=test-${bits}-INT
-  rm -Rf $dir
-  mkdir $dir
-  go run external/boringssl/src/util/fipstools/break-hash.go ${src}/libcrypto.so ${dir}/libcrypto.so
-  cp ${src}/test_fips $dir
-  adb push ${dir}/* /data/local/tmp
-  adb shell -n -t -x LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/test_fips
-  read
-else
-  echo "Second argument must be build or run"
-  exit 1
-fi
diff --git a/src/util/fipstools/break-tests.sh b/src/util/fipstools/break-tests.sh
deleted file mode 100644
index 84c24ee..0000000
--- a/src/util/fipstools/break-tests.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2018, Google Inc.
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
-# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
-# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
-# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-# This script exists to exercise breaking each of the FIPS tests. It builds
-# BoringSSL differently for each test and that can take a long time. Thus it's
-# run twice: once, from a BoringSSL source tree, with "build" as the sole
-# argument to run the builds, and then (from the same location) with no
-# arguments to run each script.
-#
-# Run it with /bin/bash, not /bin/sh, otherwise "read" may fail.
-
-set -x
-
-TESTS="NONE ECDSA_PWCT CRNG RSA_PWCT AES_CBC AES_GCM DES SHA_1 SHA_256 SHA_512 RSA_SIG DRBG ECDSA_SIG Z_COMPUTATION TLS_KDF FFC_DH"
-
-if [ "x$1" = "xbuild" ]; then
-	for test in $TESTS; do
-		rm -Rf build-$test
-		mkdir build-$test
-		pushd build-$test
-		cmake -GNinja -DCMAKE_TOOLCHAIN_FILE=${HOME}/toolchain -DFIPS=1 -DFIPS_BREAK_TEST=${test} -DCMAKE_BUILD_TYPE=Release ..
-		ninja test_fips
-		popd
-	done
-
-	exit 0
-fi
-
-for test in $TESTS; do
-	pushd build-$test
-	printf "\n\n\\x1b[1m$test\\x1b[0m\n"
-	./util/fipstools/cavp/test_fips
-	echo "Waiting for keypress..."
-	read
-	popd
-done
-
-pushd build-NONE
-printf "\\x1b[1mIntegrity\\x1b[0m\n"
-go run ../util/fipstools/break-hash.go ./util/fipstools/cavp/test_fips ./util/fipstools/cavp/test_fips_broken
-./util/fipstools/cavp/test_fips_broken
-popd
diff --git a/src/util/fipstools/cavp/CMakeLists.txt b/src/util/fipstools/cavp/CMakeLists.txt
deleted file mode 100644
index a50c9ab..0000000
--- a/src/util/fipstools/cavp/CMakeLists.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-include_directories(../../../include)
-
-if(FIPS)
-  add_executable(
-    cavp
-
-    cavp_main.cc
-
-    cavp_aes_gcm_test.cc
-    cavp_aes_test.cc
-    cavp_ctr_drbg_test.cc
-    cavp_ecdsa2_keypair_test.cc
-    cavp_ecdsa2_pkv_test.cc
-    cavp_ecdsa2_siggen_test.cc
-    cavp_ecdsa2_sigver_test.cc
-    cavp_hmac_test.cc
-    cavp_kas_test.cc
-    cavp_keywrap_test.cc
-    cavp_rsa2_keygen_test.cc
-    cavp_rsa2_siggen_test.cc
-    cavp_rsa2_sigver_test.cc
-    cavp_sha_monte_test.cc
-    cavp_sha_test.cc
-    cavp_tdes_test.cc
-    cavp_tlskdf_test.cc
-
-    cavp_test_util.cc
-  )
-
-  add_dependencies(cavp global_target)
-
-  add_executable(
-    test_fips
-
-    test_fips.c
-  )
-
-  add_dependencies(test_fips global_target)
-
-  target_link_libraries(cavp test_support_lib crypto)
-  target_link_libraries(test_fips test_support_lib crypto)
-endif()
diff --git a/src/util/fipstools/cavp/cavp_aes_gcm_test.cc b/src/util/fipstools/cavp/cavp_aes_gcm_test.cc
deleted file mode 100644
index 6ee991d..0000000
--- a/src/util/fipstools/cavp/cavp_aes_gcm_test.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_aes_gcm_test processes a NIST CAVP AES GCM test vector request file and
-// emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/aead.h>
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  const EVP_AEAD *aead;
-};
-
-}
-
-static const EVP_AEAD *GetAEAD(const std::string &name, const bool enc) {
-  if (name == "aes-128-gcm") {
-    return EVP_aead_aes_128_gcm();
-  } else if (name == "aes-192-gcm") {
-    return EVP_aead_aes_192_gcm();
-  } else if (name == "aes-256-gcm") {
-    return EVP_aead_aes_256_gcm();
-  }
-  return nullptr;
-}
-
-static bool TestAEADEncrypt(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  std::string key_len_str, iv_len_str, pt_len_str, aad_len_str, tag_len_str;
-  if (!t->GetInstruction(&key_len_str, "Keylen") ||
-      !t->GetInstruction(&iv_len_str, "IVlen") ||
-      !t->GetInstruction(&pt_len_str, "PTlen") ||
-      !t->GetInstruction(&aad_len_str, "AADlen") ||
-      !t->GetInstruction(&tag_len_str, "Taglen")) {
-    return false;
-  }
-
-  std::string count;
-  std::vector<uint8_t> key, iv, pt, aad, tag, ct;
-  if (!t->GetAttribute(&count, "Count") ||
-      !t->GetBytes(&key, "Key") ||
-      !t->GetBytes(&iv, "IV") ||
-      !t->GetBytes(&pt, "PT") ||
-      !t->GetBytes(&aad, "AAD") ||
-      key.size() * 8 != strtoul(key_len_str.c_str(), nullptr, 0) ||
-      iv.size() * 8 != strtoul(iv_len_str.c_str(), nullptr, 0) ||
-      pt.size() * 8 != strtoul(pt_len_str.c_str(), nullptr, 0) ||
-      aad.size() * 8 != strtoul(aad_len_str.c_str(), nullptr, 0) ||
-      iv.size() != 12) {
-    return false;
-  }
-
-  const size_t tag_len = strtoul(tag_len_str.c_str(), nullptr, 0) / 8;
-  if (!AEADEncrypt(ctx->aead, &ct, &tag, tag_len, key, pt, aad, iv)) {
-    return false;
-  }
-  printf("%s", t->CurrentTestToString().c_str());
-  printf("CT = %s\r\n", EncodeHex(ct).c_str());
-  printf("Tag = %s\r\n\r\n", EncodeHex(tag).c_str());
-
-  return true;
-}
-
-static bool TestAEADDecrypt(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  std::string key_len, iv_len, pt_len_str, aad_len_str, tag_len;
-  if (!t->GetInstruction(&key_len, "Keylen") ||
-      !t->GetInstruction(&iv_len, "IVlen") ||
-      !t->GetInstruction(&pt_len_str, "PTlen") ||
-      !t->GetInstruction(&aad_len_str, "AADlen") ||
-      !t->GetInstruction(&tag_len, "Taglen")) {
-    t->PrintLine("Invalid instruction block.");
-    return false;
-  }
-  size_t aad_len = strtoul(aad_len_str.c_str(), nullptr, 0) / 8;
-  size_t pt_len = strtoul(pt_len_str.c_str(), nullptr, 0) / 8;
-
-  std::string count;
-  std::vector<uint8_t> key, iv, ct, aad, tag, pt;
-  if (!t->GetAttribute(&count, "Count") ||
-      !t->GetBytes(&key, "Key") ||
-      !t->GetBytes(&aad, "AAD") ||
-      !t->GetBytes(&tag, "Tag") ||
-      !t->GetBytes(&iv, "IV") ||
-      !t->GetBytes(&ct, "CT") ||
-      key.size() * 8 != strtoul(key_len.c_str(), nullptr, 0) ||
-      iv.size() * 8 != strtoul(iv_len.c_str(), nullptr, 0) ||
-      ct.size() != pt_len ||
-      aad.size() != aad_len ||
-      tag.size() * 8 != strtoul(tag_len.c_str(), nullptr, 0)) {
-    t->PrintLine("Invalid test case");
-    return false;
-  }
-
-  printf("%s", t->CurrentTestToString().c_str());
-  bool aead_result =
-      AEADDecrypt(ctx->aead, &pt, pt_len, key, aad, ct, tag, iv);
-  if (aead_result) {
-    printf("PT = %s\r\n\r\n", EncodeHex(pt).c_str());
-  } else {
-    printf("FAIL\r\n\r\n");
-  }
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s (enc|dec) <cipher> <test file>\n", arg);
-  return 1;
-}
-
-int cavp_aes_gcm_test_main(int argc, char **argv) {
-  if (argc != 4) {
-    return usage(argv[0]);
-  }
-
-  const std::string mode(argv[1]);
-  bool (*test_fn)(FileTest * t, void *arg);
-  if (mode == "enc") {
-    test_fn = &TestAEADEncrypt;
-  } else if (mode == "dec") {
-    test_fn = &TestAEADDecrypt;
-  } else {
-    return usage(argv[0]);
-  }
-
-  const EVP_AEAD *aead = GetAEAD(argv[2], mode == "enc");
-  if (aead == nullptr) {
-    fprintf(stderr, "invalid aead: %s\n", argv[2]);
-    return 1;
-  }
-
-  TestCtx ctx = {aead};
-
-  FileTest::Options opts;
-  opts.path = argv[3];
-  opts.callback = test_fn;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_aes_test.cc b/src/util/fipstools/cavp/cavp_aes_test.cc
deleted file mode 100644
index d1f49b4..0000000
--- a/src/util/fipstools/cavp/cavp_aes_test.cc
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_aes_test processes a NIST CAVP AES test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  const EVP_CIPHER *cipher;
-  bool has_iv;
-  enum Mode {
-    kKAT,  // Known Answer Test
-    kMCT,  // Monte Carlo Test
-  };
-  Mode mode;
-};
-
-}
-
-static bool MonteCarlo(const TestCtx *ctx, FileTest *t,
-                       const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
-                       bool encrypt, std::vector<uint8_t> key,
-                       std::vector<uint8_t> iv, std::vector<uint8_t> in) {
-  const std::string in_label = encrypt ? "PLAINTEXT" : "CIPHERTEXT",
-                    result_label = encrypt ? "CIPHERTEXT" : "PLAINTEXT";
-  std::vector<uint8_t> prev_result, result, prev_in;
-  for (int i = 0; i < 100; i++) {
-    printf("COUNT = %d\r\nKEY = %s\r\n", i, EncodeHex(key).c_str());
-    if (ctx->has_iv) {
-      printf("IV = %s\r\n", EncodeHex(iv).c_str());
-    }
-    printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
-
-    if (!ctx->has_iv) {  // ECB mode
-      for (int j = 0; j < 1000; j++) {
-        prev_result = result;
-        if (!CipherOperation(cipher, &result, encrypt, key, iv, in)) {
-          return false;
-        }
-        in = result;
-      }
-    } else {
-      for (int j = 0; j < 1000; j++) {
-        prev_result = result;
-        if (j > 0) {
-          if (encrypt) {
-            iv = result;
-          } else {
-            iv = prev_in;
-          }
-        }
-
-        if (!CipherOperation(cipher, &result, encrypt, key, iv, in)) {
-          return false;
-        }
-
-        prev_in = in;
-
-        if (j == 0) {
-          in = iv;
-        } else {
-          in = prev_result;
-        }
-      }
-    }
-
-    printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
-    const size_t key_len = key.size() * 8;
-    if (key_len == 128) {
-      for (size_t k = 0; k < key.size(); k++) {
-        key[k] ^= result[k];
-      }
-    } else if (key_len == 192) {
-      for (size_t k = 0; k < key.size(); k++) {
-        // Key[i+1] = Key[i] xor (last 64-bits of CT[j-1] || CT[j])
-        if (k < 8) {
-          key[k] ^= prev_result[prev_result.size() - 8 + k];
-        } else {
-          key[k] ^= result[k - 8];
-        }
-      }
-    } else {  // key_len == 256
-      for (size_t k = 0; k < key.size(); k++) {
-        // Key[i+1] = Key[i] xor (CT[j-1] || CT[j])
-        if (k < 16) {
-          key[k] ^= prev_result[k];
-        } else {
-          key[k] ^= result[k - 16];
-        }
-      }
-    }
-
-    if (ctx->has_iv) {
-      iv = result;
-      in = prev_result;
-    } else {
-      in = result;
-    }
-  }
-
-  return true;
-}
-
-static bool TestCipher(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
-    t->PrintLine("Want either ENCRYPT or DECRYPT");
-    return false;
-  }
-  enum {
-    kEncrypt,
-    kDecrypt,
-  } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
-  std::string count;
-  std::vector<uint8_t> key, iv, in, result;
-  if (!t->GetAttribute(&count, "COUNT") ||
-      !t->GetBytes(&key, "KEY") ||
-      (ctx->has_iv && !t->GetBytes(&iv, "IV"))) {
-    return false;
-  }
-
-  const EVP_CIPHER *cipher = ctx->cipher;
-  if (operation == kEncrypt) {
-    if (!t->GetBytes(&in, "PLAINTEXT")) {
-      return false;
-    }
-  } else {  // operation == kDecrypt
-    if (!t->GetBytes(&in, "CIPHERTEXT")) {
-      return false;
-    }
-  }
-
-  if (ctx->mode == TestCtx::kKAT) {
-    if (!CipherOperation(cipher, &result, operation == kEncrypt, key, iv, in)) {
-      return false;
-    }
-    const std::string label =
-        operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
-    printf("%s%s = %s\r\n\r\n", t->CurrentTestToString().c_str(), label.c_str(),
-           EncodeHex(result).c_str());
-  } else {  // ctx->mode == kMCT
-    const std::string op_label =
-        operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
-    printf("%s\r\n\r\n", op_label.c_str());
-    if (!MonteCarlo(ctx, t, cipher, &result, operation == kEncrypt, key, iv,
-                    in)) {
-      return false;
-    }
-    if (operation == kEncrypt) {
-      // MCT tests contain a stray blank line after the ENCRYPT section.
-      printf("\r\n");
-    }
-  }
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s (kat|mct) <cipher> <test file>\n", arg);
-  return 1;
-}
-
-int cavp_aes_test_main(int argc, char **argv) {
-  if (argc != 4) {
-    return usage(argv[0]);
-  }
-
-  const std::string tm(argv[1]);
-  enum TestCtx::Mode test_mode;
-  if (tm == "kat") {
-    test_mode = TestCtx::kKAT;
-  } else if (tm == "mct") {
-    test_mode = TestCtx::kMCT;
-  } else {
-    fprintf(stderr, "invalid test_mode: %s\n", tm.c_str());
-    return usage(argv[0]);
-  }
-
-  const std::string cipher_name(argv[2]);
-  const EVP_CIPHER *cipher = GetCipher(argv[2]);
-  if (cipher == nullptr) {
-    fprintf(stderr, "invalid cipher: %s\n", argv[2]);
-    return 1;
-  }
-  const bool has_iv =
-      (cipher_name != "aes-128-ecb" &&
-       cipher_name != "aes-192-ecb" &&
-       cipher_name != "aes-256-ecb");
-
-  TestCtx ctx = {cipher, has_iv, test_mode};
-
-  FileTest::Options opts;
-  opts.path = argv[3];
-  opts.callback = TestCipher;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc b/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc
deleted file mode 100644
index a27736e..0000000
--- a/src/util/fipstools/cavp/cavp_ctr_drbg_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ctr_drbg_test processes a NIST CAVP DRBG800-90A test vector request
-// file and emits the corresponding response.
-
-#include <openssl/crypto.h>
-
-#include <stdlib.h>
-
-#include "cavp_test_util.h"
-#include "../crypto/fipsmodule/rand/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-
-
-static bool TestCTRDRBG(FileTest *t, void *arg) {
-  std::string test_type, prediction_resistance, entropy_input_len, nonce_len,
-      personalization_str_len, additional_input_len, returned_bits_len;
-  if (!t->GetInstruction(&test_type, "AES-256 no df") ||
-      !t->GetInstruction(&prediction_resistance, "PredictionResistance") ||
-      !t->GetInstruction(&entropy_input_len, "EntropyInputLen") ||
-      !t->GetInstruction(&nonce_len, "NonceLen") ||
-      !t->GetInstruction(&personalization_str_len,
-                         "PersonalizationStringLen") ||
-      !t->GetInstruction(&additional_input_len, "AdditionalInputLen") ||
-      !t->GetInstruction(&returned_bits_len, "ReturnedBitsLen") ||
-      !test_type.empty() ||
-      prediction_resistance != "False" ||
-      strtoul(entropy_input_len.c_str(), nullptr, 0) !=
-          CTR_DRBG_ENTROPY_LEN * 8 ||
-      nonce_len != "0") {
-    return false;
-  }
-
-  std::string count;
-  std::vector<uint8_t> entropy, nonce, personalization_str, ai1, ai2;
-  if (!t->GetAttribute(&count, "COUNT") ||
-      !t->GetBytes(&entropy, "EntropyInput") ||
-      !t->GetBytes(&nonce, "Nonce") ||
-      !t->GetBytes(&personalization_str, "PersonalizationString") ||
-      !t->GetBytes(&ai1, "AdditionalInput") ||
-      !t->GetBytes(&ai2, "AdditionalInput/2") ||
-      entropy.size() * 8 != strtoul(entropy_input_len.c_str(), nullptr, 0) ||
-      nonce.size() != 0 ||
-      personalization_str.size() * 8 !=
-          strtoul(personalization_str_len.c_str(), nullptr, 0) ||
-      ai1.size() != ai2.size() ||
-      ai1.size() * 8 != strtoul(additional_input_len.c_str(), nullptr, 0)) {
-    return false;
-  }
-
-  CTR_DRBG_STATE drbg;
-  CTR_DRBG_init(&drbg, entropy.data(),
-                personalization_str.size() > 0 ? personalization_str.data()
-                                               : nullptr,
-                personalization_str.size());
-
-  uint64_t out_len = strtoul(returned_bits_len.c_str(), nullptr, 0);
-  if (out_len == 0 || (out_len & 7) != 0) {
-    return false;
-  }
-  out_len /= 8;
-
-  std::vector<uint8_t> out;
-  out.resize(out_len);
-
-  CTR_DRBG_generate(&drbg, out.data(), out.size(),
-                    ai1.size() > 0 ? ai1.data() : nullptr, ai1.size());
-  CTR_DRBG_generate(&drbg, out.data(), out.size(),
-                    ai2.size() > 0 ? ai2.data() : nullptr, ai2.size());
-
-  printf("%s", t->CurrentTestToString().c_str());
-  printf("ReturnedBits = %s\r\n\r\n", EncodeHex(out).c_str());
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s <test file>\n", arg);
-  return 1;
-}
-
-int cavp_ctr_drbg_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    return usage(argv[0]);
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestCTRDRBG;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc
deleted file mode 100644
index f8c4a01..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_keypair_test.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_keypair_test processes a NIST CAVP ECDSA2 KeyPair test vector
-// request file and emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2KeyPair(FileTest *t, void *arg) {
-  std::string n_str;
-  const char *group_str;
-  int nid = GetECGroupNIDFromInstruction(t, &group_str);
-  if (nid == NID_undef ||
-      !t->GetAttribute(&n_str, "N")) {
-    return false;
-  }
-
-  // Don't use CurrentTestToString to avoid printing the N.
-  printf(
-      "[%s]\r\n\r\n[B.4.2 Key Pair Generation by Testing Candidates]\r\n\r\n",
-      group_str);
-
-  unsigned long n = strtoul(n_str.c_str(), nullptr, 10);
-  for (unsigned long i = 0; i < n; i++) {
-    bssl::UniquePtr<BIGNUM> qx(BN_new()), qy(BN_new());
-    bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-    if (!key ||
-        !EC_KEY_generate_key_fips(key.get()) ||
-        !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
-                                             EC_KEY_get0_public_key(key.get()),
-                                             qx.get(), qy.get(), nullptr)) {
-      return false;
-    }
-
-    size_t degree_len =
-        (EC_GROUP_get_degree(EC_KEY_get0_group(key.get())) + 7) / 8;
-    size_t order_len =
-        BN_num_bytes(EC_GROUP_get0_order(EC_KEY_get0_group(key.get())));
-    std::vector<uint8_t> qx_bytes(degree_len), qy_bytes(degree_len);
-    std::vector<uint8_t> d_bytes(order_len);
-    if (!BN_bn2bin_padded(qx_bytes.data(), qx_bytes.size(), qx.get()) ||
-        !BN_bn2bin_padded(qy_bytes.data(), qy_bytes.size(), qy.get()) ||
-        !BN_bn2bin_padded(d_bytes.data(), d_bytes.size(),
-                          EC_KEY_get0_private_key(key.get()))) {
-      return false;
-    }
-
-    printf("d = %s\r\nQx = %s\r\nQy = %s\r\n\r\n", EncodeHex(d_bytes).c_str(),
-           EncodeHex(qx_bytes).c_str(), EncodeHex(qy_bytes).c_str());
-  }
-
-  return true;
-}
-
-int cavp_ecdsa2_keypair_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    fprintf(stderr, "usage: %s <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestECDSA2KeyPair;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc
deleted file mode 100644
index d823e7a..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_pkv_test.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_pkv_test processes a NIST CAVP ECDSA2 PKV test vector request file
-// and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2PKV(FileTest *t, void *arg) {
-  int nid = GetECGroupNIDFromInstruction(t);
-  if (nid == NID_undef) {
-    return false;
-  }
-  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-  bssl::UniquePtr<BIGNUM> qx = GetBIGNUM(t, "Qx");
-  bssl::UniquePtr<BIGNUM> qy = GetBIGNUM(t, "Qy");
-  if (!key || !qx || !qy) {
-    return false;
-  }
-
-  if (EC_KEY_set_public_key_affine_coordinates(key.get(), qx.get(), qy.get())) {
-    printf("%sResult = P\r\n\r\n", t->CurrentTestToString().c_str());
-  } else {
-    char buf[256];
-    ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
-    printf("%sResult = F (%s)\r\n\r\n", t->CurrentTestToString().c_str(), buf);
-  }
-  ERR_clear_error();
-  return true;
-}
-
-int cavp_ecdsa2_pkv_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    fprintf(stderr, "usage: %s <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestECDSA2PKV;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc
deleted file mode 100644
index 1282eaa..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_siggen_test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_siggen_test processes NIST CAVP ECDSA2 SigGen and
-// SigGenComponent test vector request files and emits the corresponding
-// response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ec_key.h>
-#include <openssl/ecdsa.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2SigGenImpl(FileTest *t, bool is_component) {
-  int nid = GetECGroupNIDFromInstruction(t);
-  const EVP_MD *md = GetDigestFromInstruction(t);
-  if (nid == NID_undef || md == nullptr) {
-    return false;
-  }
-  bssl::UniquePtr<BIGNUM> qx(BN_new()), qy(BN_new());
-  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-  std::vector<uint8_t> msg;
-  if (!qx || !qy || !key ||
-      !EC_KEY_generate_key_fips(key.get()) ||
-      !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
-                                           EC_KEY_get0_public_key(key.get()),
-                                           qx.get(), qy.get(), nullptr) ||
-      !t->GetBytes(&msg, "Msg")) {
-    return false;
-  }
-
-  uint8_t digest[EVP_MAX_MD_SIZE];
-  unsigned digest_len;
-  if (is_component) {
-    if (msg.size() != EVP_MD_size(md)) {
-      t->PrintLine("Bad input length.");
-      return false;
-    }
-    digest_len = EVP_MD_size(md);
-    OPENSSL_memcpy(digest, msg.data(), msg.size());
-  } else if (!EVP_Digest(msg.data(), msg.size(), digest, &digest_len, md,
-                         nullptr)) {
-    return false;
-  }
-
-  bssl::UniquePtr<ECDSA_SIG> sig(ECDSA_do_sign(digest, digest_len, key.get()));
-  if (!sig) {
-    return false;
-  }
-
-  size_t degree_len =
-      (EC_GROUP_get_degree(EC_KEY_get0_group(key.get())) + 7) / 8;
-  size_t order_len =
-      BN_num_bytes(EC_GROUP_get0_order(EC_KEY_get0_group(key.get())));
-  std::vector<uint8_t> qx_bytes(degree_len), qy_bytes(degree_len);
-  std::vector<uint8_t> r_bytes(order_len), s_bytes(order_len);
-  if (!BN_bn2bin_padded(qx_bytes.data(), qx_bytes.size(), qx.get()) ||
-      !BN_bn2bin_padded(qy_bytes.data(), qy_bytes.size(), qy.get()) ||
-      !BN_bn2bin_padded(r_bytes.data(), r_bytes.size(), sig->r) ||
-      !BN_bn2bin_padded(s_bytes.data(), s_bytes.size(), sig->s)) {
-    return false;
-  }
-
-  printf("%sQx = %s\r\nQy = %s\r\nR = %s\r\nS = %s\r\n\r\n",
-         t->CurrentTestToString().c_str(), EncodeHex(qx_bytes).c_str(),
-         EncodeHex(qy_bytes).c_str(), EncodeHex(r_bytes).c_str(),
-         EncodeHex(s_bytes).c_str());
-  return true;
-}
-
-static bool TestECDSA2SigGen(FileTest *t, void *arg) {
-  return TestECDSA2SigGenImpl(t, false);
-}
-
-static bool TestECDSA2SigGenComponent(FileTest *t, void *arg) {
-  return TestECDSA2SigGenImpl(t, true);
-}
-
-int cavp_ecdsa2_siggen_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    fprintf(stderr, "usage: %s (SigGen|SigGenComponent) <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  static bool (*test_func)(FileTest *, void *);
-  if (strcmp(argv[1], "SigGen") == 0) {
-    test_func = TestECDSA2SigGen;
-  } else if (strcmp(argv[1], "SigGenComponent") == 0) {
-    test_func = TestECDSA2SigGenComponent;
-  } else {
-    fprintf(stderr, "Unknown test type: %s\n", argv[1]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.callback = test_func;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc b/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc
deleted file mode 100644
index f3fd4b1..0000000
--- a/src/util/fipstools/cavp/cavp_ecdsa2_sigver_test.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_ecdsa2_sigver_test processes a NIST CAVP ECDSA2 SigVer test vector
-// request file and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ec_key.h>
-#include <openssl/ecdsa.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-static bool TestECDSA2SigVer(FileTest *t, void *arg) {
-  int nid = GetECGroupNIDFromInstruction(t);
-  const EVP_MD *md = GetDigestFromInstruction(t);
-  if (nid == NID_undef || md == nullptr) {
-    return false;
-  }
-  bssl::UniquePtr<ECDSA_SIG> sig(ECDSA_SIG_new());
-  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-  bssl::UniquePtr<BIGNUM> qx = GetBIGNUM(t, "Qx");
-  bssl::UniquePtr<BIGNUM> qy = GetBIGNUM(t, "Qy");
-  bssl::UniquePtr<BIGNUM> r = GetBIGNUM(t, "R");
-  bssl::UniquePtr<BIGNUM> s = GetBIGNUM(t, "S");
-  std::vector<uint8_t> msg;
-  uint8_t digest[EVP_MAX_MD_SIZE];
-  unsigned digest_len;
-  if (!sig || !key || !qx || !qy || !r || !s ||
-      !EC_KEY_set_public_key_affine_coordinates(key.get(), qx.get(),
-                                                qy.get()) ||
-      !t->GetBytes(&msg, "Msg") ||
-      !EVP_Digest(msg.data(), msg.size(), digest, &digest_len, md, nullptr)) {
-    return false;
-  }
-
-  BN_free(sig->r);
-  sig->r = r.release();
-  BN_free(sig->s);
-  sig->s = s.release();
-
-  if (ECDSA_do_verify(digest, digest_len, sig.get(), key.get())) {
-    printf("%sResult = P\r\n\r\n", t->CurrentTestToString().c_str());
-  } else {
-    char buf[256];
-    ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
-    printf("%sResult = F (%s)\r\n\r\n", t->CurrentTestToString().c_str(), buf);
-  }
-  ERR_clear_error();
-  return true;
-}
-
-int cavp_ecdsa2_sigver_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    fprintf(stderr, "usage: %s <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestECDSA2SigVer;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_hmac_test.cc b/src/util/fipstools/cavp/cavp_hmac_test.cc
deleted file mode 100644
index c88226a..0000000
--- a/src/util/fipstools/cavp/cavp_hmac_test.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_hmac_test processes a NIST CAVP HMAC test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/hmac.h>
-#include <openssl/span.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestHMAC(FileTest *t, void *arg) {
-  std::string md_len_str;
-  if (!t->GetInstruction(&md_len_str, "L")) {
-    return false;
-  }
-  const size_t md_len = strtoul(md_len_str.c_str(), nullptr, 0);
-
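-  // The "L" instruction gives the MAC length in bytes, which identifies the
-  // underlying hash function.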
-  const EVP_MD *md;
-  switch (md_len) {
-    case 20:
-      md = EVP_sha1();
-      break;
-    case 28:
-      md = EVP_sha224();
-      break;
-    case 32:
-      md = EVP_sha256();
-      break;
-    case 48:
-      md = EVP_sha384();
-      break;
-    case 64:
-      md = EVP_sha512();
-      break;
-    default:
-      return false;
-  }
-
-  std::string count_str, k_len_str, t_len_str;
-  std::vector<uint8_t> key, msg;
-  if (!t->GetAttribute(&count_str, "Count") ||
-      !t->GetAttribute(&k_len_str, "Klen") ||
-      !t->GetAttribute(&t_len_str, "Tlen") ||
-      !t->GetBytes(&key, "Key") ||
-      !t->GetBytes(&msg, "Msg")) {
-    return false;
-  }
-
-  size_t k_len = strtoul(k_len_str.c_str(), nullptr, 0);
-  size_t t_len = strtoul(t_len_str.c_str(), nullptr, 0);
-  if (key.size() < k_len) {
-    return false;
-  }
-  unsigned out_len;
-  uint8_t out[EVP_MAX_MD_SIZE];
-  if (HMAC(md, key.data(), k_len, msg.data(), msg.size(), out, &out_len) ==
-      NULL) {
-    return false;
-  }
-
-  if (out_len < t_len) {
-    return false;
-  }
-
-  printf("%s", t->CurrentTestToString().c_str());
-  printf("Mac = %s\r\n\r\n",
-         EncodeHex(bssl::MakeConstSpan(out, t_len)).c_str());
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s <test file>\n", arg);
-  return 1;
-}
-
-int cavp_hmac_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    return usage(argv[0]);
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestHMAC;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_kas_test.cc b/src/util/fipstools/cavp/cavp_kas_test.cc
deleted file mode 100644
index 9a74f1d..0000000
--- a/src/util/fipstools/cavp/cavp_kas_test.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) 2018, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_kas_test processes NIST CAVP ECC KAS test vector request files and
-// emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/ecdh.h>
-#include <openssl/ecdsa.h>
-#include <openssl/ec_key.h>
-#include <openssl/err.h>
-#include <openssl/nid.h>
-#include <openssl/sha.h>
-#include <openssl/span.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestKAS(FileTest *t, void *arg) {
-  const bool validate = *reinterpret_cast<bool *>(arg);
-
-  int nid = NID_undef;
-  size_t digest_len = 0;
-
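-  // Each KAS instruction names both the curve ("EB" through "EE") and the
-  // hash used to derive the shared secret.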
-  if (t->HasInstruction("EB - SHA224")) {
-    nid = NID_secp224r1;
-    digest_len = SHA224_DIGEST_LENGTH;
-  } else if (t->HasInstruction("EC - SHA256")) {
-    nid = NID_X9_62_prime256v1;
-    digest_len = SHA256_DIGEST_LENGTH;
-  } else if (t->HasInstruction("ED - SHA384")) {
-    nid = NID_secp384r1;
-    digest_len = SHA384_DIGEST_LENGTH;
-  } else if (t->HasInstruction("EE - SHA512")) {
-    nid = NID_secp521r1;
-    digest_len = SHA512_DIGEST_LENGTH;
-  } else {
-    return false;
-  }
-
-  if (!t->HasAttribute("COUNT")) {
-    return false;
-  }
-
-  bssl::UniquePtr<BIGNUM> their_x(GetBIGNUM(t, "QeCAVSx"));
-  bssl::UniquePtr<BIGNUM> their_y(GetBIGNUM(t, "QeCAVSy"));
-  bssl::UniquePtr<EC_KEY> ec_key(EC_KEY_new_by_curve_name(nid));
-  bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
-  if (!their_x || !their_y || !ec_key || !ctx) {
-    return false;
-  }
-
-  const EC_GROUP *const group = EC_KEY_get0_group(ec_key.get());
-  bssl::UniquePtr<EC_POINT> their_point(EC_POINT_new(group));
-  if (!their_point ||
-      !EC_POINT_set_affine_coordinates_GFp(
-          group, their_point.get(), their_x.get(), their_y.get(), ctx.get())) {
-    return false;
-  }
-
-  if (validate) {
-    bssl::UniquePtr<BIGNUM> our_k(GetBIGNUM(t, "deIUT"));
-    if (!our_k ||
-        !EC_KEY_set_private_key(ec_key.get(), our_k.get()) ||
-        // These attributes are ignored.
-        !t->HasAttribute("QeIUTx") ||
-        !t->HasAttribute("QeIUTy")) {
-      return false;
-    }
-  } else if (!EC_KEY_generate_key(ec_key.get())) {
-    return false;
-  }
-
-  uint8_t digest[EVP_MAX_MD_SIZE];
-  if (!ECDH_compute_key_fips(digest, digest_len, their_point.get(),
-                             ec_key.get())) {
-    return false;
-  }
-
-  if (validate) {
-    std::vector<uint8_t> expected_shared_bytes;
-    if (!t->GetBytes(&expected_shared_bytes, "CAVSHashZZ")) {
-      return false;
-    }
-    const bool ok =
-        digest_len == expected_shared_bytes.size() &&
-        OPENSSL_memcmp(digest, expected_shared_bytes.data(), digest_len) == 0;
-
-    printf("%sIUTHashZZ = %s\r\nResult = %c\r\n\r\n\r\n",
-           t->CurrentTestToString().c_str(),
-           EncodeHex(bssl::MakeConstSpan(digest, digest_len)).c_str(),
-           ok ? 'P' : 'F');
-  } else {
-    const EC_POINT *pub = EC_KEY_get0_public_key(ec_key.get());
-    bssl::UniquePtr<BIGNUM> x(BN_new());
-    bssl::UniquePtr<BIGNUM> y(BN_new());
-    if (!x || !y ||
-        !EC_POINT_get_affine_coordinates_GFp(group, pub, x.get(), y.get(),
-                                             ctx.get())) {
-      return false;
-    }
-    bssl::UniquePtr<char> x_hex(BN_bn2hex(x.get()));
-    bssl::UniquePtr<char> y_hex(BN_bn2hex(y.get()));
-
-    printf("%sQeIUTx = %s\r\nQeIUTy = %s\r\nHashZZ = %s\r\n",
-           t->CurrentTestToString().c_str(), x_hex.get(), y_hex.get(),
-           EncodeHex(bssl::MakeConstSpan(digest, digest_len)).c_str());
-  }
-
-  return true;
-}
-
-int cavp_kas_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    fprintf(stderr, "usage: %s (validity|function) <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  bool validity;
-  if (strcmp(argv[1], "validity") == 0) {
-    validity = true;
-  } else if (strcmp(argv[1], "function") == 0) {
-    validity = false;
-  } else {
-    fprintf(stderr, "Unknown test type: %s\n", argv[1]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.arg = &validity;
-  opts.callback = TestKAS;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  opts.is_kas_test = true;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_keywrap_test.cc b/src/util/fipstools/cavp/cavp_keywrap_test.cc
deleted file mode 100644
index 67397ec..0000000
--- a/src/util/fipstools/cavp/cavp_keywrap_test.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_keywrap_test processes a NIST CAVP AES test vector request file and
-// emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/aes.h>
-#include <openssl/crypto.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  bool encrypt;
-  bool padding;
-};
-
-}  // namespace
-
-static bool AESKeyWrap(std::vector<uint8_t> *out, bool encrypt,
-                       const std::vector<uint8_t> &key,
-                       const std::vector<uint8_t> &in) {
-  size_t key_bits = key.size() * 8;
-  if (key_bits != 128 && key_bits != 192 && key_bits != 256) {
-    return false;
-  }
-  AES_KEY aes_key;
-
-  if (encrypt) {
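-    // RFC 3394 key wrap adds an 8-byte integrity block, so wrapped output is
-    // eight bytes longer than the input.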
-    out->resize(in.size() + 8);
-    if (AES_set_encrypt_key(key.data(), key_bits, &aes_key) ||
-        AES_wrap_key(&aes_key, nullptr, out->data(), in.data(), in.size()) ==
-            -1) {
-      return false;
-    }
-  } else {
-    out->resize(in.size() - 8);
-    if (AES_set_decrypt_key(key.data(), key_bits, &aes_key) ||
-        AES_unwrap_key(&aes_key, nullptr, out->data(), in.data(), in.size()) ==
-            -1) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-static bool AESKeyWrapWithPadding(std::vector<uint8_t> *out, bool encrypt,
-                                  const std::vector<uint8_t> &key,
-                                  const std::vector<uint8_t> &in) {
-  const size_t key_bits = key.size() * 8;
-  if (key_bits != 128 && key_bits != 192 && key_bits != 256) {
-    return false;
-  }
-  AES_KEY aes_key;
-
-  size_t out_len;
-  if (encrypt) {
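-    // Padded key wrap (RFC 5649) adds at most 15 bytes: padding to an 8-byte
-    // boundary plus the 8-byte header.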
-    out->resize(in.size() + 15);
-    if (AES_set_encrypt_key(key.data(), key_bits, &aes_key) ||
-        !AES_wrap_key_padded(&aes_key, out->data(), &out_len, out->size(),
-                             in.data(), in.size())) {
-      return false;
-    }
-  } else {
-    out->resize(in.size());
-    if (AES_set_decrypt_key(key.data(), key_bits, &aes_key) ||
-        !AES_unwrap_key_padded(&aes_key, out->data(), &out_len, out->size(),
-                               in.data(), in.size())) {
-      return false;
-    }
-  }
-
-  out->resize(out_len);
-  return true;
-}
-
-static bool TestCipher(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  std::string count, unused, in_label = ctx->encrypt ? "P" : "C",
-                             result_label = ctx->encrypt ? "C" : "P";
-  std::vector<uint8_t> key, in, result;
-  // clang-format off
-  if (!t->GetInstruction(&unused, "PLAINTEXT LENGTH") ||
-      !t->GetAttribute(&count, "COUNT") ||
-      !t->GetBytes(&key, "K") ||
-      !t->GetBytes(&in, in_label)) {
-    return false;
-  }
-  // clang-format on
-
-  auto wrap_function = AESKeyWrap;
-  if (ctx->padding) {
-    wrap_function = AESKeyWrapWithPadding;
-  }
-
-  printf("%s", t->CurrentTestToString().c_str());
-  if (!wrap_function(&result, ctx->encrypt, key, in)) {
-    if (ctx->encrypt) {
-      return false;
-    } else {
-      printf("FAIL\r\n\r\n");
-    }
-  } else {
-    printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-  }
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(
-      stderr,
-      "usage: %s (enc|dec|enc-pad|dec-pad) (128|192|256) <test file>\n",
-      arg);
-  return 1;
-}
-
-int cavp_keywrap_test_main(int argc, char **argv) {
-  if (argc != 4) {
-    return usage(argv[0]);
-  }
-
-  const std::string op(argv[1]);
-  bool encrypt = false;
-  bool padding = false;
-  if (op == "enc") {
-    encrypt = true;
-  } else if (op == "dec") {
-  } else if (op == "enc-pad") {
-    encrypt = true;
-    padding = true;
-  } else if (op == "dec-pad") {
-    padding = true;
-  } else {
-    return usage(argv[0]);
-  }
-
-  TestCtx ctx = {encrypt, padding};
-
-  FileTest::Options opts;
-  opts.path = argv[3];
-  opts.callback = TestCipher;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_main.cc b/src/util/fipstools/cavp/cavp_main.cc
deleted file mode 100644
index 64dbd69..0000000
--- a/src/util/fipstools/cavp/cavp_main.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_main is a wrapper that invokes the main entry function of one of the
-// CAVP validation suite binaries.
-
-#include <stdlib.h>
-#include <cstdio>
-#include <string>
-
-#include <openssl/crypto.h>
-
-#include "cavp_test_util.h"
-
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s <validation suite> <args ...>\n", arg);
-  return 1;
-}
-
-struct TestSuite {
-  std::string name;
-  int (*main_func)(int argc, char **argv);
-};
-
-static TestSuite all_test_suites[] = {
-    {"aes", &cavp_aes_test_main},
-    {"aes_gcm", &cavp_aes_gcm_test_main},
-    {"ctr_drbg", &cavp_ctr_drbg_test_main},
-    {"ecdsa2_keypair", &cavp_ecdsa2_keypair_test_main},
-    {"ecdsa2_pkv", &cavp_ecdsa2_pkv_test_main},
-    {"ecdsa2_siggen", &cavp_ecdsa2_siggen_test_main},
-    {"ecdsa2_sigver", &cavp_ecdsa2_sigver_test_main},
-    {"hmac", &cavp_hmac_test_main},
-    {"kas", &cavp_kas_test_main},
-    {"keywrap", &cavp_keywrap_test_main},
-    {"rsa2_keygen", &cavp_rsa2_keygen_test_main},
-    {"rsa2_siggen", &cavp_rsa2_siggen_test_main},
-    {"rsa2_sigver", &cavp_rsa2_sigver_test_main},
-    {"tlskdf", &cavp_tlskdf_test_main},
-    {"sha", &cavp_sha_test_main},
-    {"sha_monte", &cavp_sha_monte_test_main},
-    {"tdes", &cavp_tdes_test_main}
-};
-
-int main(int argc, char **argv) {
-  CRYPTO_library_init();
-
-  if (argc < 3) {
-    return usage(argv[0]);
-  }
-
-  const std::string suite(argv[1]);
-  for (const TestSuite &s : all_test_suites) {
-    if (s.name == suite) {
-      return s.main_func(argc - 1, &argv[1]);
-    }
-  }
-
-  fprintf(stderr, "invalid test suite: %s\n\n", argv[1]);
-  return usage(argv[0]);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc b/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc
deleted file mode 100644
index e7088c7..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_keygen_test.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_keygen_test processes NIST CAVP RSA2 KeyGen test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-static bool TestRSA2KeyGen(FileTest *t, void *arg) {
-  std::string mod_str, table, count_str;
-  if (!t->GetInstruction(&mod_str, "mod") ||
-      !t->GetInstruction(&table, "Table for M-R Test") ||
-      table != "C.2" ||
-      !t->GetAttribute(&count_str, "N")) {
-    return false;
-  }
-
-  printf("[mod = %s]\r\n", mod_str.c_str());
-  printf("[Table for M-R Test = %s]\r\n\r\n", table.c_str());
-
-  size_t bits = strtoul(mod_str.c_str(), nullptr, 0);
-  size_t count = strtoul(count_str.c_str(), nullptr, 0);
-  for (size_t i = 0; i < count; i++) {
-    bssl::UniquePtr<RSA> key(RSA_new());
-    if (key == nullptr ||
-        bits == 0 ||
-        !RSA_generate_key_fips(key.get(), bits, nullptr)) {
-      return false;
-    }
-
-    const BIGNUM *n, *e, *d, *p, *q;
-    RSA_get0_key(key.get(), &n, &e, &d);
-    RSA_get0_factors(key.get(), &p, &q);
-    if (n == NULL || e == NULL || d == NULL || p == NULL || q == NULL) {
-      return false;
-    }
-    // d is serialized zero-padded to the modulus width; the other values use
-    // their minimal big-endian encoding.
-    std::vector<uint8_t> n_bytes(BN_num_bytes(n)), e_bytes(BN_num_bytes(e)),
-        d_bytes((bits + 7) / 8), p_bytes(BN_num_bytes(p)),
-        q_bytes(BN_num_bytes(q));
-    if (BN_bn2bin(n, n_bytes.data()) != n_bytes.size() ||
-        BN_bn2bin(e, e_bytes.data()) != e_bytes.size() ||
-        !BN_bn2bin_padded(d_bytes.data(), d_bytes.size(), d) ||
-        BN_bn2bin(p, p_bytes.data()) != p_bytes.size() ||
-        BN_bn2bin(q, q_bytes.data()) != q_bytes.size()) {
-      return false;
-    }
-
-    printf("e = %s\r\np = %s\r\nq = %s\r\nn = %s\r\nd = %s\r\n\r\n",
-           EncodeHex(e_bytes).c_str(), EncodeHex(p_bytes).c_str(),
-           EncodeHex(q_bytes).c_str(), EncodeHex(n_bytes).c_str(),
-           EncodeHex(d_bytes).c_str());
-  }
-
-  return true;
-}
-
-int cavp_rsa2_keygen_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    fprintf(stderr, "usage: %s <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestRSA2KeyGen;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc b/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc
deleted file mode 100644
index 636a73a..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_siggen_test.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_siggen_test processes NIST CAVP RSA2 SigGen test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-namespace {
-
-struct TestCtx {
-  bssl::UniquePtr<RSA> key;
-  bool is_pss;
-};
-
-}  // namespace
-
-static bool TestRSA2SigGen(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  std::string mod_str, hash;
-  std::vector<uint8_t> msg;
-  if (!t->GetInstruction(&mod_str, "mod") ||
-      !t->GetAttribute(&hash, "SHAAlg") ||
-      !t->GetBytes(&msg, "Msg")) {
-    return false;
-  }
-
-  std::string test = t->CurrentTestToString();
-  if (t->IsAtNewInstructionBlock()) {
-    int mod_bits = strtoul(mod_str.c_str(), nullptr, 0);
-    ctx->key = bssl::UniquePtr<RSA>(RSA_new());
-    if (ctx->key == nullptr ||
-        mod_bits == 0 ||
-        !RSA_generate_key_fips(ctx->key.get(), mod_bits, nullptr)) {
-      return false;
-    }
-
-    const BIGNUM *n, *e;
-    RSA_get0_key(ctx->key.get(), &n, &e, nullptr);
-
-    std::vector<uint8_t> n_bytes(BN_num_bytes(n));
-    std::vector<uint8_t> e_bytes(BN_num_bytes(e));
-    if (!BN_bn2bin_padded(n_bytes.data(), n_bytes.size(), n) ||
-        !BN_bn2bin_padded(e_bytes.data(), e_bytes.size(), e)) {
-      return false;
-    }
-
-    printf("[mod = %s]\r\n\r\nn = %s\r\n\r\ne = %s", mod_str.c_str(),
-           EncodeHex(n_bytes).c_str(), EncodeHex(e_bytes).c_str());
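-    // The instruction header was already printed above with n and e, so strip
-    // it (the text up to and including "]\r\n") from the echoed test.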
-    test = test.substr(test.find("]") + 3);
-  }
-
-  const EVP_MD *md = EVP_get_digestbyname(hash.c_str());
-  uint8_t digest_buf[EVP_MAX_MD_SIZE];
-  std::vector<uint8_t> sig(RSA_size(ctx->key.get()));
-  unsigned digest_len;
-  size_t sig_len;
-  if (md == NULL ||
-      !EVP_Digest(msg.data(), msg.size(), digest_buf, &digest_len, md, NULL)) {
-    return false;
-  }
-
-  if (ctx->is_pss) {
-    if (!RSA_sign_pss_mgf1(ctx->key.get(), &sig_len, sig.data(), sig.size(),
-                           digest_buf, digest_len, md, md, -1)) {
-      return false;
-    }
-  } else {
-    unsigned sig_len_u;
-    if (!RSA_sign(EVP_MD_type(md), digest_buf, digest_len, sig.data(),
-                  &sig_len_u, ctx->key.get())) {
-      return false;
-    }
-    sig_len = sig_len_u;
-  }
-
-  sig.resize(sig_len);
-  printf("%sS = %s\r\n\r\n", test.c_str(), EncodeHex(sig).c_str());
-  return true;
-}
-
-int cavp_rsa2_siggen_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    fprintf(stderr, "usage: %s (pkcs15|pss) <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  TestCtx ctx;
-  if (strcmp(argv[1], "pkcs15") == 0) {
-    ctx = {nullptr, false};
-  } else if (strcmp(argv[1], "pss") == 0) {
-    ctx = {nullptr, true};
-  } else {
-    fprintf(stderr, "Unknown test type: %s\n", argv[1]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.callback = TestRSA2SigGen;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc b/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc
deleted file mode 100644
index cbcfc1f..0000000
--- a/src/util/fipstools/cavp/cavp_rsa2_sigver_test.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_rsa2_sigver_test processes NIST CAVP RSA2 SigVer test vector request
-// files and emits the corresponding response.
-
-#include <vector>
-
-#include <openssl/bn.h>
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-#include <openssl/err.h>
-#include <openssl/rsa.h>
-
-#include "../crypto/internal.h"
-#include "../crypto/test/file_test.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  std::vector<uint8_t> N;
-  bool is_pss;
-};
-
-}  // namespace
-
-static bool TestRSA2SigVer(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  std::string mod_str;
-  if (!t->GetInstruction(&mod_str, "mod")) {
-    return false;
-  }
-
-  printf("%s", t->CurrentTestToString().c_str());
-
-  if (t->HasAttribute("n")) {
-    printf("\r\n");
-    return t->GetBytes(&ctx->N, "n");
-  }
-
-  std::string hash;
-  std::vector<uint8_t> e_bytes, msg, sig;
-  if (!t->GetAttribute(&hash, "SHAAlg") ||
-      !t->GetBytes(&e_bytes, "e") ||
-      !t->GetBytes(&msg, "Msg") ||
-      !t->GetBytes(&sig, "S")) {
-    return false;
-  }
-
-  bssl::UniquePtr<RSA> key(RSA_new());
-  // Check the allocation before |key| is dereferenced below.
-  if (key == nullptr) {
-    return false;
-  }
-  key->n = BN_new();
-  key->e = BN_new();
-  if (!BN_bin2bn(ctx->N.data(), ctx->N.size(), key->n) ||
-      !BN_bin2bn(e_bytes.data(), e_bytes.size(), key->e)) {
-    return false;
-  }
-
-  const EVP_MD *md = EVP_get_digestbyname(hash.c_str());
-  uint8_t digest_buf[EVP_MAX_MD_SIZE];
-  unsigned digest_len;
-  if (md == NULL ||
-      !EVP_Digest(msg.data(), msg.size(), digest_buf, &digest_len, md, NULL)) {
-    return false;
-  }
-
-  int ok;
-  if (ctx->is_pss) {
-    ok = RSA_verify_pss_mgf1(key.get(), digest_buf, digest_len, md, md, -1,
-                             sig.data(), sig.size());
-  } else {
-    ok = RSA_verify(EVP_MD_type(md), digest_buf, digest_len, sig.data(),
-                    sig.size(), key.get());
-  }
-
-  if (ok) {
-    printf("Result = P\r\n\r\n");
-  } else {
-    char buf[256];
-    ERR_error_string_n(ERR_get_error(), buf, sizeof(buf));
-    printf("Result = F (%s)\r\n\r\n", buf);
-  }
-  ERR_clear_error();
-  return true;
-}
-
-int cavp_rsa2_sigver_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    fprintf(stderr, "usage: %s (pkcs15|pss) <test file>\n",
-            argv[0]);
-    return 1;
-  }
-
-  TestCtx ctx;
-  if (strcmp(argv[1], "pkcs15") == 0) {
-    ctx = {std::vector<uint8_t>(), false};
-  } else if (strcmp(argv[1], "pss") == 0) {
-    ctx = {std::vector<uint8_t>(), true};
-  } else {
-    fprintf(stderr, "Unknown test type: %s\n", argv[1]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.callback = TestRSA2SigVer;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_sha_monte_test.cc b/src/util/fipstools/cavp/cavp_sha_monte_test.cc
deleted file mode 100644
index f5bcdd1..0000000
--- a/src/util/fipstools/cavp/cavp_sha_monte_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_sha_monte_test processes a NIST CAVP SHA-Monte test vector request file
-// and emits the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  std::string hash;
-};
-
-}  // namespace
-
-static bool TestSHAMonte(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  const EVP_MD *md = EVP_get_digestbyname(ctx->hash.c_str());
-  if (md == nullptr) {
-    return false;
-  }
-  const size_t md_len = EVP_MD_size(md);
-
-  std::string out_len;
-  if (!t->GetInstruction(&out_len, "L") ||
-      md_len != strtoul(out_len.c_str(), nullptr, 0)) {
-    return false;
-  }
-
-  std::vector<uint8_t> seed;
-  if (!t->GetBytes(&seed, "Seed") ||
-      seed.size() != md_len) {
-    return false;
-  }
-
-  std::vector<uint8_t> out = seed;
-
-  printf("%s\r\n", t->CurrentTestToString().c_str());
-
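-  // SHA Monte Carlo test: seed the message with three copies of the running
-  // digest, hash 1000 times per checkpoint, and emit 100 checkpoints.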
-  for (int count = 0; count < 100; count++) {
-    std::vector<uint8_t> msg;
-    msg.insert(msg.end(), out.begin(), out.end());
-    msg.insert(msg.end(), out.begin(), out.end());
-    msg.insert(msg.end(), out.begin(), out.end());
-    for (int i = 0; i < 1000; i++) {
-      unsigned digest_len;
-      if (!EVP_Digest(msg.data(), msg.size(), out.data(), &digest_len, md,
-                      nullptr) ||
-          digest_len != out.size()) {
-        return false;
-      }
-
-      msg.erase(msg.begin(), msg.begin() + out.size());
-      msg.insert(msg.end(), out.begin(), out.end());
-    }
-    printf("COUNT = %d\r\n", count);
-    printf("MD = %s\r\n\r\n", EncodeHex(out).c_str());
-  }
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s <hash> <test file>\n", arg);
-  return 1;
-}
-
-int cavp_sha_monte_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    return usage(argv[0]);
-  }
-
-  TestCtx ctx = {std::string(argv[1])};
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.callback = TestSHAMonte;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_sha_test.cc b/src/util/fipstools/cavp/cavp_sha_test.cc
deleted file mode 100644
index c046451..0000000
--- a/src/util/fipstools/cavp/cavp_sha_test.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_sha_test processes a NIST CAVP SHA test vector request file and emits
-// the corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/crypto.h>
-#include <openssl/digest.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-namespace {
-
-struct TestCtx {
-  std::string hash;
-};
-
-}  // namespace
-
-static bool TestSHA(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  const EVP_MD *md = EVP_get_digestbyname(ctx->hash.c_str());
-  if (md == nullptr) {
-    return false;
-  }
-  const size_t md_len = EVP_MD_size(md);
-
-  std::string out_len;
-  if (!t->GetInstruction(&out_len, "L") ||
-      md_len != strtoul(out_len.c_str(), nullptr, 0)) {
-    return false;
-  }
-
-  std::string msg_len_str;
-  std::vector<uint8_t> msg;
-  if (!t->GetAttribute(&msg_len_str, "Len") ||
-      !t->GetBytes(&msg, "Msg")) {
-    return false;
-  }
-
-  size_t msg_len = strtoul(msg_len_str.c_str(), nullptr, 0);
-  if (msg_len % 8 != 0 ||
-      msg_len / 8 > msg.size()) {
-    return false;
-  }
-  msg_len /= 8;
-
-  std::vector<uint8_t> out;
-  out.resize(md_len);
-  unsigned digest_len;
-  if (!EVP_Digest(msg.data(), msg_len, out.data(), &digest_len, md, nullptr) ||
-      digest_len != out.size()) {
-    return false;
-  }
-
-  printf("%s", t->CurrentTestToString().c_str());
-  printf("MD = %s\r\n\r\n", EncodeHex(out).c_str());
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s <hash> <test file>\n", arg);
-  return 1;
-}
-
-int cavp_sha_test_main(int argc, char **argv) {
-  if (argc != 3) {
-    return usage(argv[0]);
-  }
-
-  TestCtx ctx = {std::string(argv[1])};
-
-  FileTest::Options opts;
-  opts.path = argv[2];
-  opts.callback = TestSHA;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_tdes_test.cc b/src/util/fipstools/cavp/cavp_tdes_test.cc
deleted file mode 100644
index 7b8839d..0000000
--- a/src/util/fipstools/cavp/cavp_tdes_test.cc
+++ /dev/null
@@ -1,336 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_tdes_test processes a NIST TMOVS test vector request file and emits the
-// corresponding response.
-
-#include <stdlib.h>
-
-#include <openssl/cipher.h>
-#include <openssl/crypto.h>
-#include <openssl/err.h>
-
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-#include "cavp_test_util.h"
-
-
-namespace {
-
-struct TestCtx {
-  const EVP_CIPHER *cipher;
-  enum Mode {
-    kKAT,  // Known Answer Test
-    kMCT,  // Monte Carlo Test
-  };
-  bool has_iv;
-  Mode mode;
-};
-
-}  // namespace
-
-static bool TestKAT(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
-    t->PrintLine("Want either ENCRYPT or DECRYPT");
-    return false;
-  }
-  enum {
-    kEncrypt,
-    kDecrypt,
-  } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
-  if (t->HasAttribute("NumKeys")) {
-    // Another file format quirk: NumKeys is a single attribute line immediately
-    // following an instruction and should probably have been an instruction
-    // instead. If it is present, the file has separate attributes "KEY{1,2,3}".
-    // If it is not, the keys are concatenated in a single attribute "KEYs".
-    std::string num_keys;
-    t->GetAttribute(&num_keys, "NumKeys");
-    t->InjectInstruction("NumKeys", num_keys);
-
-    std::string header = operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
-    printf("%s\r\n\r\n", header.c_str());
-
-    return true;
-  }
-
-  enum {
-    kNotPresent,
-    kTwo,
-    kThree,
-  } num_keys = kNotPresent;
-  if (t->HasInstruction("NumKeys")) {
-    std::string num_keys_str;
-    t->GetInstruction(&num_keys_str, "NumKeys");
-    const int n = strtoul(num_keys_str.c_str(), nullptr, 0);
-    if (n == 2) {
-      num_keys = kTwo;
-    } else if (n == 3) {
-      num_keys = kThree;
-    } else {
-      t->PrintLine("invalid NumKeys value");
-      return false;
-    }
-  }
-
-  std::string count;
-  std::vector<uint8_t> keys, key1, key2, key3, iv, in, result;
-  const std::string in_label =
-      operation == kEncrypt ? "PLAINTEXT" : "CIPHERTEXT";
-  // clang-format off
-  if (!t->GetAttribute(&count, "COUNT") ||
-      (num_keys == kNotPresent && !t->GetBytes(&keys, "KEYs")) ||
-      (num_keys != kNotPresent &&
-       (!t->GetBytes(&key1, "KEY1") ||
-        !t->GetBytes(&key2, "KEY2") ||
-        !t->GetBytes(&key3, "KEY3"))) ||
-      (ctx->has_iv && !t->GetBytes(&iv, "IV")) ||
-      !t->GetBytes(&in, in_label)) {
-    return false;
-  }
-  // clang-format on
-  std::vector<uint8_t> key;
-  if (num_keys != kNotPresent) {
-    key.insert(key.end(), key1.begin(), key1.end());
-    key.insert(key.end(), key2.begin(), key2.end());
-    if (num_keys == kThree) {
-      key.insert(key.end(), key3.begin(), key3.end());
-    }
-  } else {
-    key.insert(key.end(), keys.begin(), keys.end());
-    key.insert(key.end(), keys.begin(), keys.end());
-    key.insert(key.end(), keys.begin(), keys.end());
-  }
-
-  if (!CipherOperation(ctx->cipher, &result, operation == kEncrypt, key, iv,
-                       in)) {
-    return false;
-  }
-
-  // The output format of the TDES FAX files differs from file to file, and
-  // the input format is inconsistent with the output, so construct the
-  // output manually rather than printing CurrentTestToString().
-  if (t->IsAtNewInstructionBlock() && num_keys == kNotPresent) {
-    // If NumKeys is present, header is printed when parsing NumKeys.
-    std::string header = operation == kEncrypt ? "[ENCRYPT]" : "[DECRYPT]";
-    printf("%s\r\n", header.c_str());
-  }
-  const std::string result_label =
-      operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
-  printf("COUNT = %s\r\n", count.c_str());
-  if (num_keys == kNotPresent) {
-    printf("KEYs = %s\r\n", EncodeHex(keys).c_str());
-  } else {
-    printf("KEY1 = %s\r\nKEY2 = %s\r\nKEY3 = %s\r\n", EncodeHex(key1).c_str(),
-           EncodeHex(key2).c_str(), EncodeHex(key3).c_str());
-  }
-  if (ctx->has_iv) {
-    printf("IV = %s\r\n", EncodeHex(iv).c_str());
-  }
-  printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
-  printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
-  return true;
-}
-
-// XORKeyWithOddParityLSB sets |*key| to |key| XOR |value| and then sets the
-// LSB of each byte to establish odd parity for that byte. This parity-based
-// embedding of a DES key into 64 bits is an old tradition and something that
-// NIST's tests require.
-static void XORKeyWithOddParityLSB(std::vector<uint8_t> *key,
-                                        const std::vector<uint8_t> &value) {
-  for (size_t i = 0; i < key->size(); i++) {
-    uint8_t v = (*key)[i] ^ value[i];
-
-    // Use LSB to establish odd parity.
-    v |= 0x01;
-    for (uint8_t j = 1; j < 8; j++) {
-      v ^= ((v >> j) & 0x01);
-    }
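-    // Folding the upper seven bits into the LSB leaves each byte with odd
-    // parity, as the NIST DES vectors expect.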
-    (*key)[i] = v;
-  }
-}
-
-static bool TestMCT(FileTest *t, void *arg) {
-  TestCtx *ctx = reinterpret_cast<TestCtx *>(arg);
-
-  if (t->HasInstruction("ENCRYPT") == t->HasInstruction("DECRYPT")) {
-    t->PrintLine("Want either ENCRYPT or DECRYPT");
-    return false;
-  }
-  enum {
-    kEncrypt,
-    kDecrypt,
-  } operation = t->HasInstruction("ENCRYPT") ? kEncrypt : kDecrypt;
-
-  if (t->HasAttribute("NumKeys")) {
-    // Another file format quirk: NumKeys is a single attribute line immediately
-    // following an instruction and should probably have been an instruction
-    // instead.
-    std::string num_keys;
-    t->GetAttribute(&num_keys, "NumKeys");
-    t->InjectInstruction("NumKeys", num_keys);
-    return true;
-  }
-
-  enum {
-    kTwo,
-    kThree,
-  } num_keys;
-  std::string num_keys_str;
-  if (!t->GetInstruction(&num_keys_str, "NumKeys")) {
-    return false;
-  }
-  const int n = strtoul(num_keys_str.c_str(), nullptr, 0);
-  if (n == 2) {
-    num_keys = kTwo;
-  } else if (n == 3) {
-    num_keys = kThree;
-  } else {
-    t->PrintLine("invalid NumKeys value");
-    return false;
-  }
-
-  std::string count;
-  std::vector<uint8_t> key1, key2, key3, iv, in, result;
-  const std::string in_label =
-      operation == kEncrypt ? "PLAINTEXT" : "CIPHERTEXT";
-  // clang-format off
-  if (!t->GetBytes(&key1, "KEY1") ||
-      !t->GetBytes(&key2, "KEY2") ||
-      !t->GetBytes(&key3, "KEY3") ||
-      (ctx->has_iv && !t->GetBytes(&iv, "IV")) ||
-      !t->GetBytes(&in, in_label)) {
-    return false;
-  }
-  // clang-format on
-
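-  // TMOVS Monte Carlo test: 400 outer rounds, each chaining 10000 cipher
-  // operations and then mixing the last three results into the next keys.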
-  for (int i = 0; i < 400; i++) {
-    std::vector<uint8_t> current_iv = iv, current_in = in, prev_result,
-                         prev_prev_result;
-
-    std::vector<uint8_t> key(key1);
-    key.insert(key.end(), key2.begin(), key2.end());
-    key.insert(key.end(), key3.begin(), key3.end());
-
-    for (int j = 0; j < 10000; j++) {
-      prev_prev_result = prev_result;
-      prev_result = result;
-      const EVP_CIPHER *cipher = ctx->cipher;
-      if (!CipherOperation(cipher, &result, operation == kEncrypt, key,
-                           current_iv, current_in)) {
-        t->PrintLine("CipherOperation failed");
-        return false;
-      }
-      if (ctx->has_iv) {
-        if (operation == kEncrypt) {
-          if (j == 0) {
-            current_in = current_iv;
-          } else {
-            current_in = prev_result;
-          }
-          current_iv = result;
-        } else {  // operation == kDecrypt
-          current_iv = current_in;
-          current_in = result;
-        }
-      } else {
-        current_in = result;
-      }
-    }
-
-    // Output result for COUNT = i.
-    const std::string result_label =
-        operation == kEncrypt ? "CIPHERTEXT" : "PLAINTEXT";
-    if (i == 0) {
-      const std::string op_label =
-          operation == kEncrypt ? "ENCRYPT" : "DECRYPT";
-      printf("[%s]\n\n", op_label.c_str());
-    }
-    printf("COUNT = %d\r\nKEY1 = %s\r\nKEY2 = %s\r\nKEY3 = %s\r\n", i,
-           EncodeHex(key1).c_str(), EncodeHex(key2).c_str(),
-           EncodeHex(key3).c_str());
-    if (ctx->has_iv) {
-      printf("IV = %s\r\n", EncodeHex(iv).c_str());
-    }
-    printf("%s = %s\r\n", in_label.c_str(), EncodeHex(in).c_str());
-    printf("%s = %s\r\n\r\n", result_label.c_str(), EncodeHex(result).c_str());
-
-    XORKeyWithOddParityLSB(&key1, result);
-    XORKeyWithOddParityLSB(&key2, prev_result);
-    if (num_keys == kThree) {
-      XORKeyWithOddParityLSB(&key3, prev_prev_result);
-    } else {
-      XORKeyWithOddParityLSB(&key3, result);
-    }
-
-    if (ctx->has_iv) {
-      if (operation == kEncrypt) {
-        in = prev_result;
-        iv = result;
-      } else {
-        iv = current_iv;
-        in = current_in;
-      }
-    } else {
-      in = result;
-    }
-  }
-
-  return true;
-}
-
-static int usage(char *arg) {
-  fprintf(stderr, "usage: %s (kat|mct) <cipher> <test file>\n", arg);
-  return 1;
-}
-
-int cavp_tdes_test_main(int argc, char **argv) {
-  if (argc != 4) {
-    return usage(argv[0]);
-  }
-
-  const std::string tm(argv[1]);
-  enum TestCtx::Mode test_mode;
-  if (tm == "kat") {
-    test_mode = TestCtx::kKAT;
-  } else if (tm == "mct") {
-    test_mode = TestCtx::kMCT;
-  } else {
-    fprintf(stderr, "invalid test_mode: %s\n", tm.c_str());
-    return usage(argv[0]);
-  }
-
-  const std::string cipher_name(argv[2]);
-  const EVP_CIPHER *cipher = GetCipher(argv[2]);
-  if (cipher == nullptr) {
-    fprintf(stderr, "invalid cipher: %s\n", argv[2]);
-    return 1;
-  }
-  bool has_iv = cipher_name != "des-ede" && cipher_name != "des-ede3";
-  TestCtx ctx = {cipher, has_iv, test_mode};
-
-  FileTestFunc test_fn = test_mode == TestCtx::kKAT ? &TestKAT : &TestMCT;
-  FileTest::Options opts;
-  opts.path = argv[3];
-  opts.callback = test_fn;
-  opts.arg = &ctx;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
diff --git a/src/util/fipstools/cavp/cavp_test_util.cc b/src/util/fipstools/cavp/cavp_test_util.cc
deleted file mode 100644
index 1b4e3a1..0000000
--- a/src/util/fipstools/cavp/cavp_test_util.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#include "cavp_test_util.h"
-
-#include <openssl/bn.h>
-#include <openssl/digest.h>
-#include <openssl/ec.h>
-#include <openssl/nid.h>
-
-
-const EVP_CIPHER *GetCipher(const std::string &name) {
-  if (name == "des-cbc") {
-    return EVP_des_cbc();
-  } else if (name == "des-ecb") {
-    return EVP_des_ecb();
-  } else if (name == "des-ede") {
-    return EVP_des_ede();
-  } else if (name == "des-ede3") {
-    return EVP_des_ede3();
-  } else if (name == "des-ede-cbc") {
-    return EVP_des_ede_cbc();
-  } else if (name == "des-ede3-cbc") {
-    return EVP_des_ede3_cbc();
-  } else if (name == "rc4") {
-    return EVP_rc4();
-  } else if (name == "aes-128-ecb") {
-    return EVP_aes_128_ecb();
-  } else if (name == "aes-256-ecb") {
-    return EVP_aes_256_ecb();
-  } else if (name == "aes-128-cbc") {
-    return EVP_aes_128_cbc();
-  } else if (name == "aes-128-gcm") {
-    return EVP_aes_128_gcm();
-  } else if (name == "aes-128-ofb") {
-    return EVP_aes_128_ofb();
-  } else if (name == "aes-192-cbc") {
-    return EVP_aes_192_cbc();
-  } else if (name == "aes-192-ctr") {
-    return EVP_aes_192_ctr();
-  } else if (name == "aes-192-ecb") {
-    return EVP_aes_192_ecb();
-  } else if (name == "aes-256-cbc") {
-    return EVP_aes_256_cbc();
-  } else if (name == "aes-128-ctr") {
-    return EVP_aes_128_ctr();
-  } else if (name == "aes-256-ctr") {
-    return EVP_aes_256_ctr();
-  } else if (name == "aes-256-gcm") {
-    return EVP_aes_256_gcm();
-  } else if (name == "aes-256-ofb") {
-    return EVP_aes_256_ofb();
-  }
-  return nullptr;
-}
-
-bool CipherOperation(const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
-                     bool encrypt, const std::vector<uint8_t> &key,
-                     const std::vector<uint8_t> &iv,
-                     const std::vector<uint8_t> &in) {
-  bssl::ScopedEVP_CIPHER_CTX ctx;
-  if (!EVP_CipherInit_ex(ctx.get(), cipher, nullptr, nullptr, nullptr,
-                         encrypt ? 1 : 0)) {
-    return false;
-  }
-  if (!iv.empty() && iv.size() != EVP_CIPHER_CTX_iv_length(ctx.get())) {
-    return false;
-  }
-
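-  // Initialization happens in two steps so that the key length can be
-  // adjusted (for variable-key ciphers such as RC4) before the key is set.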
-  int result_len1 = 0, result_len2;
-  *out = std::vector<uint8_t>(in.size());
-  if (!EVP_CIPHER_CTX_set_key_length(ctx.get(), key.size()) ||
-      !EVP_CipherInit_ex(ctx.get(), nullptr, nullptr, key.data(), iv.data(),
-                         -1) ||
-      !EVP_CIPHER_CTX_set_padding(ctx.get(), 0) ||
-      !EVP_CipherUpdate(ctx.get(), out->data(), &result_len1, in.data(),
-                        in.size()) ||
-      !EVP_CipherFinal_ex(ctx.get(), out->data() + result_len1, &result_len2)) {
-    return false;
-  }
-  out->resize(result_len1 + result_len2);
-
-  return true;
-}
-
-bool AEADEncrypt(const EVP_AEAD *aead, std::vector<uint8_t> *ct,
-                 std::vector<uint8_t> *tag, size_t tag_len,
-                 const std::vector<uint8_t> &key,
-                 const std::vector<uint8_t> &pt,
-                 const std::vector<uint8_t> &aad,
-                 const std::vector<uint8_t> &iv) {
-  bssl::ScopedEVP_AEAD_CTX ctx;
-  if (!EVP_AEAD_CTX_init(ctx.get(), aead, key.data(), key.size(), tag_len,
-                         nullptr)) {
-    return false;
-  }
-
-  std::vector<uint8_t> out;
-  out.resize(pt.size() + EVP_AEAD_max_overhead(aead));
-  size_t out_len;
-  if (!EVP_AEAD_CTX_seal(ctx.get(), out.data(), &out_len, out.size(), iv.data(),
-                         iv.size(), pt.data(), pt.size(), aad.data(),
-                         aad.size())) {
-    return false;
-  }
-  out.resize(out_len);
-
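-  // EVP_AEAD_CTX_seal emits the ciphertext followed by the tag; split them so
-  // the response file can print the two fields separately.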
-  ct->assign(out.begin(), out.end() - tag_len);
-  tag->assign(out.end() - tag_len, out.end());
-
-  return true;
-}
-
-bool AEADDecrypt(const EVP_AEAD *aead, std::vector<uint8_t> *pt, size_t pt_len,
-                 const std::vector<uint8_t> &key,
-                 const std::vector<uint8_t> &aad,
-                 const std::vector<uint8_t> &ct,
-                 const std::vector<uint8_t> &tag,
-                 const std::vector<uint8_t> &iv) {
-  bssl::ScopedEVP_AEAD_CTX ctx;
-  if (!EVP_AEAD_CTX_init_with_direction(ctx.get(), aead, key.data(), key.size(),
-                                        tag.size(), evp_aead_open)) {
-    return false;
-  }
-  std::vector<uint8_t> in = ct;
-  in.reserve(ct.size() + tag.size());
-  in.insert(in.end(), tag.begin(), tag.end());
-
-  pt->resize(pt_len);
-  size_t out_pt_len;
-  if (!EVP_AEAD_CTX_open(ctx.get(), pt->data(), &out_pt_len, pt->size(),
-                         iv.data(), iv.size(), in.data(), in.size(), aad.data(),
-                         aad.size()) ||
-      out_pt_len != pt_len) {
-    return false;
-  }
-  return true;
-}
-
-static int HexToBIGNUM(bssl::UniquePtr<BIGNUM> *out, const char *in) {
-  BIGNUM *raw = NULL;
-  int ret = BN_hex2bn(&raw, in);
-  out->reset(raw);
-  return ret;
-}
-
-bssl::UniquePtr<BIGNUM> GetBIGNUM(FileTest *t, const char *attribute) {
-  std::string hex;
-  if (!t->GetAttribute(&hex, attribute)) {
-    return nullptr;
-  }
-
-  bssl::UniquePtr<BIGNUM> ret;
-  if (HexToBIGNUM(&ret, hex.c_str()) != static_cast<int>(hex.size())) {
-    t->PrintLine("Could not decode '%s'.", hex.c_str());
-    return nullptr;
-  }
-  return ret;
-}
-
-int GetECGroupNIDFromInstruction(FileTest *t, const char **out_str) {
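-  // Callers may pass nullptr for |out_str|; point it at a local sink so the
-  // assignments below are unconditional.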
-  const char *dummy;
-  if (out_str == nullptr) {
-    out_str = &dummy;
-  }
-
-  if (t->HasInstruction("P-224")) {
-    *out_str = "P-224";
-    return NID_secp224r1;
-  }
-  if (t->HasInstruction("P-256")) {
-    *out_str = "P-256";
-    return NID_X9_62_prime256v1;
-  }
-  if (t->HasInstruction("P-384")) {
-    *out_str = "P-384";
-    return NID_secp384r1;
-  }
-  if (t->HasInstruction("P-521")) {
-    *out_str = "P-521";
-    return NID_secp521r1;
-  }
-  t->PrintLine("No supported group specified.");
-  return NID_undef;
-}
-
-const EVP_MD *GetDigestFromInstruction(FileTest *t) {
-  if (t->HasInstruction("SHA-1")) {
-    return EVP_sha1();
-  }
-  if (t->HasInstruction("SHA-224")) {
-    return EVP_sha224();
-  }
-  if (t->HasInstruction("SHA-256")) {
-    return EVP_sha256();
-  }
-  if (t->HasInstruction("SHA-384")) {
-    return EVP_sha384();
-  }
-  if (t->HasInstruction("SHA-512")) {
-    return EVP_sha512();
-  }
-  t->PrintLine("No supported digest function specified.");
-  return nullptr;
-}
-
-void EchoComment(const std::string &comment) {
-  fwrite(comment.c_str(), comment.size(), 1, stdout);
-}
diff --git a/src/util/fipstools/cavp/cavp_test_util.h b/src/util/fipstools/cavp/cavp_test_util.h
deleted file mode 100644
index d51dfe6..0000000
--- a/src/util/fipstools/cavp/cavp_test_util.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2017, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
-#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
-
-#include <stdlib.h>
-#include <string>
-#include <vector>
-
-#include <openssl/aead.h>
-#include <openssl/cipher.h>
-
-#include "../crypto/test/file_test.h"
-
-
-const EVP_CIPHER *GetCipher(const std::string &name);
-
-bool CipherOperation(const EVP_CIPHER *cipher, std::vector<uint8_t> *out,
-                     bool encrypt, const std::vector<uint8_t> &key,
-                     const std::vector<uint8_t> &iv,
-                     const std::vector<uint8_t> &in);
-
-bool AEADEncrypt(const EVP_AEAD *aead, std::vector<uint8_t> *ct,
-                 std::vector<uint8_t> *tag, size_t tag_len,
-                 const std::vector<uint8_t> &key,
-                 const std::vector<uint8_t> &pt,
-                 const std::vector<uint8_t> &aad,
-                 const std::vector<uint8_t> &iv);
-
-bool AEADDecrypt(const EVP_AEAD *aead, std::vector<uint8_t> *pt, size_t pt_len,
-                 const std::vector<uint8_t> &key,
-                 const std::vector<uint8_t> &aad,
-                 const std::vector<uint8_t> &ct,
-                 const std::vector<uint8_t> &tag,
-                 const std::vector<uint8_t> &iv);
-
-bssl::UniquePtr<BIGNUM> GetBIGNUM(FileTest *t, const char *attribute);
-
-int GetECGroupNIDFromInstruction(FileTest *t, const char **out_str = nullptr);
-
-const EVP_MD *GetDigestFromInstruction(FileTest *t);
-
-void EchoComment(const std::string &comment);
-
-int cavp_aes_gcm_test_main(int argc, char **argv);
-int cavp_aes_test_main(int argc, char **argv);
-int cavp_ctr_drbg_test_main(int argc, char **argv);
-int cavp_ecdsa2_keypair_test_main(int argc, char **argv);
-int cavp_ecdsa2_pkv_test_main(int argc, char **argv);
-int cavp_ecdsa2_siggen_test_main(int argc, char **argv);
-int cavp_ecdsa2_sigver_test_main(int argc, char **argv);
-int cavp_hmac_test_main(int argc, char **argv);
-int cavp_kas_test_main(int argc, char **argv);
-int cavp_keywrap_test_main(int argc, char **argv);
-int cavp_rsa2_keygen_test_main(int argc, char **argv);
-int cavp_rsa2_siggen_test_main(int argc, char **argv);
-int cavp_rsa2_sigver_test_main(int argc, char **argv);
-int cavp_sha_monte_test_main(int argc, char **argv);
-int cavp_sha_test_main(int argc, char **argv);
-int cavp_tdes_test_main(int argc, char **argv);
-int cavp_tlskdf_test_main(int argc, char **argv);
-
-
-#endif  // OPENSSL_HEADER_CRYPTO_FIPSMODULE_CAVP_TEST_UTIL_H
diff --git a/src/util/fipstools/cavp/cavp_tlskdf_test.cc b/src/util/fipstools/cavp/cavp_tlskdf_test.cc
deleted file mode 100644
index 0243439..0000000
--- a/src/util/fipstools/cavp/cavp_tlskdf_test.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2018, Google Inc.
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
-// cavp_tlskdf_test processes NIST TLS KDF test vectors and emits the
-// corresponding response.
-// See https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/components/askdfvs.pdf, section 6.4.
-
-#include <vector>
-
-#include <errno.h>
-
-#include <openssl/digest.h>
-
-#include "cavp_test_util.h"
-#include "../crypto/fipsmodule/tls/internal.h"
-#include "../crypto/test/file_test.h"
-#include "../crypto/test/test_util.h"
-
-
-static bool TestTLSKDF(FileTest *t, void *arg) {
-  const EVP_MD *md = nullptr;
-
-  if (t->HasInstruction("TLS 1.0/1.1")) {
-    md = EVP_md5_sha1();
-  } else if (t->HasInstruction("TLS 1.2")) {
-    if (t->HasInstruction("SHA-256")) {
-      md = EVP_sha256();
-    } else if (t->HasInstruction("SHA-384")) {
-      md = EVP_sha384();
-    } else if (t->HasInstruction("SHA-512")) {
-      md = EVP_sha512();
-    }
-  }
-
-  if (md == nullptr) {
-    return false;
-  }
-
-  std::string key_block_len_str;
-  std::vector<uint8_t> premaster, server_random, client_random,
-      key_block_server_random, key_block_client_random;
-  if (!t->GetBytes(&premaster, "pre_master_secret") ||
-      !t->GetBytes(&server_random, "serverHello_random") ||
-      !t->GetBytes(&client_random, "clientHello_random") ||
-      // The NIST tests specify different client and server randoms for the
-      // expansion step than for the master-secret step, which is impossible
-      // in real TLS.
-      !t->GetBytes(&key_block_server_random, "server_random") ||
-      !t->GetBytes(&key_block_client_random, "client_random") ||
-      !t->GetInstruction(&key_block_len_str, "key block length") ||
-      // These are ignored.
-      !t->HasAttribute("COUNT") ||
-      !t->HasInstruction("pre-master secret length")) {
-    return false;
-  }
-
-  uint8_t master_secret[48];
-  static const char kMasterSecretLabel[] = "master secret";
-  if (!CRYPTO_tls1_prf(md, master_secret, sizeof(master_secret),
-                       premaster.data(), premaster.size(), kMasterSecretLabel,
-                       sizeof(kMasterSecretLabel) - 1, client_random.data(),
-                       client_random.size(), server_random.data(),
-                       server_random.size())) {
-    return false;
-  }
-
-  errno = 0;
-  const long int key_block_bits =
-      strtol(key_block_len_str.c_str(), nullptr, 10);
-  if (errno != 0 || key_block_bits <= 0 || (key_block_bits & 7) != 0) {
-    return false;
-  }
-  const size_t key_block_len = key_block_bits / 8;
-  std::vector<uint8_t> key_block(key_block_len);
-  static const char kLabel[] = "key expansion";
-  if (!CRYPTO_tls1_prf(
-          md, key_block.data(), key_block.size(), master_secret,
-          sizeof(master_secret), kLabel, sizeof(kLabel) - 1,
-          key_block_server_random.data(), key_block_server_random.size(),
-          key_block_client_random.data(), key_block_client_random.size())) {
-    return false;
-  }
-
-  printf("%smaster_secret = %s\r\nkey_block = %s\r\n\r\n",
-         t->CurrentTestToString().c_str(), EncodeHex(master_secret).c_str(),
-         EncodeHex(key_block).c_str());
-
-  return true;
-}
-
-int cavp_tlskdf_test_main(int argc, char **argv) {
-  if (argc != 2) {
-    fprintf(stderr, "usage: %s <test file>\n", argv[0]);
-    return 1;
-  }
-
-  FileTest::Options opts;
-  opts.path = argv[1];
-  opts.callback = TestTLSKDF;
-  opts.silent = true;
-  opts.comment_callback = EchoComment;
-  return FileTestMain(opts);
-}
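
Note on the deleted TLS KDF test: it called CRYPTO_tls1_prf twice, first with the
"master secret" label to derive the 48-byte master secret from the premaster
secret, then with the "key expansion" label (and the randoms swapped to
server-then-client order) to derive the key block. For TLS 1.2 this PRF is
P_hash from RFC 5246 over a single digest. The sketch below is a minimal,
self-contained Go illustration of that construction — not BoringSSL's
implementation — and it omits the TLS 1.0/1.1 case, where the secret is split
between MD5 and SHA-1.

// tlsprf.go: a sketch of the TLS 1.2 PRF (RFC 5246, section 5), the
// primitive the deleted cavp_tlskdf_test exercised. Inputs are zeroed
// placeholders; the real test read them from NIST request files.
package main

import (
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"fmt"
)

// pHash implements P_SHA256(secret, seed), filling all of out.
func pHash(out, secret, seed []byte) {
	h := hmac.New(sha256.New, secret)
	h.Write(seed)
	a := h.Sum(nil) // A(1) = HMAC(secret, seed)
	for len(out) > 0 {
		// Output block: HMAC(secret, A(i) + seed).
		h.Reset()
		h.Write(a)
		h.Write(seed)
		n := copy(out, h.Sum(nil))
		out = out[n:]
		// A(i+1) = HMAC(secret, A(i)).
		h.Reset()
		h.Write(a)
		a = h.Sum(nil)
	}
}

// prf computes PRF(secret, label, seed) = P_hash(secret, label + seed).
func prf(length int, secret []byte, label string, seed []byte) []byte {
	out := make([]byte, length)
	pHash(out, secret, append([]byte(label), seed...))
	return out
}

func main() {
	premaster := make([]byte, 48) // zeroed placeholder inputs
	clientRandom := make([]byte, 32)
	serverRandom := make([]byte, 32)

	// master_secret = PRF(premaster, "master secret",
	//                     client_random + server_random), 48 bytes.
	master := prf(48, premaster, "master secret",
		append(append([]byte{}, clientRandom...), serverRandom...))
	// key_block = PRF(master, "key expansion",
	//                 server_random + client_random), key-block-length bytes.
	keyBlock := prf(40, master, "key expansion",
		append(append([]byte{}, serverRandom...), clientRandom...))
	fmt.Println("master_secret =", hex.EncodeToString(master))
	fmt.Println("key_block     =", hex.EncodeToString(keyBlock))
}
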
diff --git a/src/util/fipstools/cavp/run_cavp.go b/src/util/fipstools/cavp/run_cavp.go
deleted file mode 100644
index 51a4100..0000000
--- a/src/util/fipstools/cavp/run_cavp.go
+++ /dev/null
@@ -1,592 +0,0 @@
-// run_cavp.go processes CAVP input files and generates suitable response
-// files, optionally comparing the results against the provided FAX files.
-package main
-
-import (
-	"bufio"
-	"errors"
-	"flag"
-	"fmt"
-	"os"
-	"os/exec"
-	"path"
-	"path/filepath"
-	"runtime"
-	"strings"
-	"sync"
-	"time"
-)
-
-var (
-	oraclePath = flag.String("oracle-bin", "", "Path to the oracle binary")
-	suiteDir   = flag.String("suite-dir", "", "Base directory containing the CAVP test suite")
-	noFAX      = flag.Bool("no-fax", false, "Skip comparing against FAX files")
-	android    = flag.Bool("android", false, "Run tests via ADB")
-)
-
-const (
-	androidTmpPath       = "/data/local/tmp/"
-	androidCAVPPath      = androidTmpPath + "cavp"
-	androidLibCryptoPath = androidTmpPath + "libcrypto.so"
-)
-
-// test describes a single request file.
-type test struct {
-	// inFile is the base of the filename without an extension, e.g.
-	// “ECBMCT128”.
-	inFile string
-	// args are the arguments (not including the input filename) to the
-	// oracle binary.
-	args []string
-	// noFAX, if true, indicates that the output cannot be compared against
-	// the FAX file (e.g. because the primitive is non-deterministic).
-	noFAX bool
-}
-
-	// nextLineState can be used by the FAX next-line function to store state.
-type nextLineState struct {
-	// State used by the KAS test.
-	nextIsIUTHash bool
-}
-
-// testSuite describes a series of tests that are handled by a single oracle
-// binary.
-type testSuite struct {
-	// directory is the name of the directory in the CAVP input, e.g. “AES”.
-	directory string
-	// suite names the test suite to pass as the first command-line argument.
-	suite string
-	// nextLineFunc, if not nil, is the function used to read the next line
-	// from the FAX file. It can be used to skip lines and/or mutate them as
-	// needed. The second argument lets the function store state between
-	// calls, if needed. If isWildcard is true on return, line is not
-	// meaningful and any line from the response file should be accepted.
-	nextLineFunc func(*bufio.Scanner, *nextLineState) (line string, isWildcard, ok bool)
-	tests        []test
-}
-
-func (t *testSuite) getDirectory() string {
-	return filepath.Join(*suiteDir, t.directory)
-}
-
-var aesGCMTests = testSuite{
-	"AES_GCM",
-	"aes_gcm",
-	nil,
-	[]test{
-		{"gcmDecrypt128", []string{"dec", "aes-128-gcm"}, false},
-		{"gcmDecrypt192", []string{"dec", "aes-192-gcm"}, false},
-		{"gcmDecrypt256", []string{"dec", "aes-256-gcm"}, false},
-		{"gcmEncryptExtIV128", []string{"enc", "aes-128-gcm"}, false},
-		{"gcmEncryptExtIV192", []string{"enc", "aes-192-gcm"}, false},
-		{"gcmEncryptExtIV256", []string{"enc", "aes-256-gcm"}, false},
-	},
-}
-
-var aesTests = testSuite{
-	"AES",
-	"aes",
-	nil,
-	[]test{
-		{"CBCGFSbox128", []string{"kat", "aes-128-cbc"}, false},
-		{"CBCGFSbox192", []string{"kat", "aes-192-cbc"}, false},
-		{"CBCGFSbox256", []string{"kat", "aes-256-cbc"}, false},
-		{"CBCKeySbox128", []string{"kat", "aes-128-cbc"}, false},
-		{"CBCKeySbox192", []string{"kat", "aes-192-cbc"}, false},
-		{"CBCKeySbox256", []string{"kat", "aes-256-cbc"}, false},
-		{"CBCMMT128", []string{"kat", "aes-128-cbc"}, false},
-		{"CBCMMT192", []string{"kat", "aes-192-cbc"}, false},
-		{"CBCMMT256", []string{"kat", "aes-256-cbc"}, false},
-		{"CBCVarKey128", []string{"kat", "aes-128-cbc"}, false},
-		{"CBCVarKey192", []string{"kat", "aes-192-cbc"}, false},
-		{"CBCVarKey256", []string{"kat", "aes-256-cbc"}, false},
-		{"CBCVarTxt128", []string{"kat", "aes-128-cbc"}, false},
-		{"CBCVarTxt192", []string{"kat", "aes-192-cbc"}, false},
-		{"CBCVarTxt256", []string{"kat", "aes-256-cbc"}, false},
-		{"ECBGFSbox128", []string{"kat", "aes-128-ecb"}, false},
-		{"ECBGFSbox192", []string{"kat", "aes-192-ecb"}, false},
-		{"ECBGFSbox256", []string{"kat", "aes-256-ecb"}, false},
-		{"ECBKeySbox128", []string{"kat", "aes-128-ecb"}, false},
-		{"ECBKeySbox192", []string{"kat", "aes-192-ecb"}, false},
-		{"ECBKeySbox256", []string{"kat", "aes-256-ecb"}, false},
-		{"ECBMMT128", []string{"kat", "aes-128-ecb"}, false},
-		{"ECBMMT192", []string{"kat", "aes-192-ecb"}, false},
-		{"ECBMMT256", []string{"kat", "aes-256-ecb"}, false},
-		{"ECBVarKey128", []string{"kat", "aes-128-ecb"}, false},
-		{"ECBVarKey192", []string{"kat", "aes-192-ecb"}, false},
-		{"ECBVarKey256", []string{"kat", "aes-256-ecb"}, false},
-		{"ECBVarTxt128", []string{"kat", "aes-128-ecb"}, false},
-		{"ECBVarTxt192", []string{"kat", "aes-192-ecb"}, false},
-		{"ECBVarTxt256", []string{"kat", "aes-256-ecb"}, false},
-		// AES Monte-Carlo tests
-		{"ECBMCT128", []string{"mct", "aes-128-ecb"}, false},
-		{"ECBMCT192", []string{"mct", "aes-192-ecb"}, false},
-		{"ECBMCT256", []string{"mct", "aes-256-ecb"}, false},
-		{"CBCMCT128", []string{"mct", "aes-128-cbc"}, false},
-		{"CBCMCT192", []string{"mct", "aes-192-cbc"}, false},
-		{"CBCMCT256", []string{"mct", "aes-256-cbc"}, false},
-	},
-}
-
-var ecdsa2KeyPairTests = testSuite{
-	"ECDSA2",
-	"ecdsa2_keypair",
-	nil,
-	[]test{{"KeyPair", nil, true}},
-}
-
-var ecdsa2PKVTests = testSuite{
-	"ECDSA2",
-	"ecdsa2_pkv",
-	nil,
-	[]test{{"PKV", nil, false}},
-}
-
-var ecdsa2SigGenTests = testSuite{
-	"ECDSA2",
-	"ecdsa2_siggen",
-	nil,
-	[]test{
-		{"SigGen", []string{"SigGen"}, true},
-		{"SigGenComponent", []string{"SigGenComponent"}, true},
-	},
-}
-
-var ecdsa2SigVerTests = testSuite{
-	"ECDSA2",
-	"ecdsa2_sigver",
-	nil,
-	[]test{{"SigVer", nil, false}},
-}
-
-var rsa2KeyGenTests = testSuite{
-	"RSA2",
-	"rsa2_keygen",
-	nil,
-	[]test{
-		{"KeyGen_RandomProbablyPrime3_3", nil, true},
-	},
-}
-
-var rsa2SigGenTests = testSuite{
-	"RSA2",
-	"rsa2_siggen",
-	nil,
-	[]test{
-		{"SigGen15_186-3", []string{"pkcs15"}, true},
-		{"SigGenPSS_186-3", []string{"pss"}, true},
-	},
-}
-
-var rsa2SigVerTests = testSuite{
-	"RSA2",
-	"rsa2_sigver",
-	func(s *bufio.Scanner, state *nextLineState) (string, bool, bool) {
-		for {
-			if !s.Scan() {
-				return "", false, false
-			}
-
-			line := s.Text()
-			if strings.HasPrefix(line, "p = ") || strings.HasPrefix(line, "d = ") || strings.HasPrefix(line, "SaltVal = ") || strings.HasPrefix(line, "EM with ") {
-				continue
-			}
-			if strings.HasPrefix(line, "q = ") {
-				// Skip the "q = " line and an additional blank line.
-				if !s.Scan() ||
-					len(strings.TrimSpace(s.Text())) > 0 {
-					return "", false, false
-				}
-				continue
-			}
-			return line, false, true
-		}
-	},
-	[]test{
-		{"SigVer15_186-3", []string{"pkcs15"}, false},
-		{"SigVerPSS_186-3", []string{"pss"}, false},
-	},
-}
-
-var hmacTests = testSuite{
-	"HMAC",
-	"hmac",
-	nil,
-	[]test{{"HMAC", nil, false}},
-}
-
-var shaTests = testSuite{
-	"SHA",
-	"sha",
-	nil,
-	[]test{
-		{"SHA1LongMsg", []string{"SHA1"}, false},
-		{"SHA1ShortMsg", []string{"SHA1"}, false},
-		{"SHA224LongMsg", []string{"SHA224"}, false},
-		{"SHA224ShortMsg", []string{"SHA224"}, false},
-		{"SHA256LongMsg", []string{"SHA256"}, false},
-		{"SHA256ShortMsg", []string{"SHA256"}, false},
-		{"SHA384LongMsg", []string{"SHA384"}, false},
-		{"SHA384ShortMsg", []string{"SHA384"}, false},
-		{"SHA512LongMsg", []string{"SHA512"}, false},
-		{"SHA512ShortMsg", []string{"SHA512"}, false},
-	},
-}
-
-var shaMonteTests = testSuite{
-	"SHA",
-	"sha_monte",
-	nil,
-	[]test{
-		{"SHA1Monte", []string{"SHA1"}, false},
-		{"SHA224Monte", []string{"SHA224"}, false},
-		{"SHA256Monte", []string{"SHA256"}, false},
-		{"SHA384Monte", []string{"SHA384"}, false},
-		{"SHA512Monte", []string{"SHA512"}, false},
-	},
-}
-
-var ctrDRBGTests = testSuite{
-	"DRBG800-90A",
-	"ctr_drbg",
-	nil,
-	[]test{{"CTR_DRBG", nil, false}},
-}
-
-var tdesTests = testSuite{
-	"TDES",
-	"tdes",
-	nil,
-	[]test{
-		{"TCBCMMT2", []string{"kat", "des-ede-cbc"}, false},
-		{"TCBCMMT3", []string{"kat", "des-ede3-cbc"}, false},
-		{"TCBCMonte2", []string{"mct", "des-ede3-cbc"}, false},
-		{"TCBCMonte3", []string{"mct", "des-ede3-cbc"}, false},
-		{"TCBCinvperm", []string{"kat", "des-ede3-cbc"}, false},
-		{"TCBCpermop", []string{"kat", "des-ede3-cbc"}, false},
-		{"TCBCsubtab", []string{"kat", "des-ede3-cbc"}, false},
-		{"TCBCvarkey", []string{"kat", "des-ede3-cbc"}, false},
-		{"TCBCvartext", []string{"kat", "des-ede3-cbc"}, false},
-		{"TECBMMT2", []string{"kat", "des-ede"}, false},
-		{"TECBMMT3", []string{"kat", "des-ede3"}, false},
-		{"TECBMonte2", []string{"mct", "des-ede3"}, false},
-		{"TECBMonte3", []string{"mct", "des-ede3"}, false},
-		{"TECBinvperm", []string{"kat", "des-ede3"}, false},
-		{"TECBpermop", []string{"kat", "des-ede3"}, false},
-		{"TECBsubtab", []string{"kat", "des-ede3"}, false},
-		{"TECBvarkey", []string{"kat", "des-ede3"}, false},
-		{"TECBvartext", []string{"kat", "des-ede3"}, false},
-	},
-}
-
-var keyWrapTests = testSuite{
-	"KeyWrap38F",
-	"keywrap",
-	nil,
-	[]test{
-		{"KW_AD_128", []string{"dec", "128"}, false},
-		{"KW_AD_192", []string{"dec", "192"}, false},
-		{"KW_AD_256", []string{"dec", "256"}, false},
-		{"KW_AE_128", []string{"enc", "128"}, false},
-		{"KW_AE_192", []string{"enc", "192"}, false},
-		{"KW_AE_256", []string{"enc", "256"}, false},
-		{"KWP_AD_128", []string{"dec-pad", "128"}, false},
-		{"KWP_AD_192", []string{"dec-pad", "192"}, false},
-		{"KWP_AD_256", []string{"dec-pad", "256"}, false},
-		{"KWP_AE_128", []string{"enc-pad", "128"}, false},
-		{"KWP_AE_192", []string{"enc-pad", "192"}, false},
-		{"KWP_AE_256", []string{"enc-pad", "256"}, false},
-	},
-}
-
-var kasTests = testSuite{
-	"KAS",
-	"kas",
-	func(s *bufio.Scanner, state *nextLineState) (line string, isWildcard, ok bool) {
-		for {
-			// If the next line of the response file will be the
-			// IUT hash, return a wildcard signal because it
-			// cannot be matched against the FAX file.
-			if state.nextIsIUTHash {
-				state.nextIsIUTHash = false
-				return "", true, true
-			}
-
-			if !s.Scan() {
-				return "", false, false
-			}
-
-			line := s.Text()
-			if strings.HasPrefix(line, "deCAVS = ") || strings.HasPrefix(line, "Z = ") {
-				continue
-			}
-			if strings.HasPrefix(line, "CAVSHashZZ = ") {
-				state.nextIsIUTHash = true
-			}
-			return line, false, true
-		}
-	},
-	[]test{
-		{"KASFunctionTest_ECCEphemeralUnified_NOKC_ZZOnly_init", []string{"function"}, true},
-		{"KASFunctionTest_ECCEphemeralUnified_NOKC_ZZOnly_resp", []string{"function"}, true},
-		{"KASValidityTest_ECCEphemeralUnified_NOKC_ZZOnly_init", []string{"validity"}, false},
-		{"KASValidityTest_ECCEphemeralUnified_NOKC_ZZOnly_resp", []string{"validity"}, false},
-	},
-}
-
-var tlsKDFTests = testSuite{
-	"KDF135",
-	"tlskdf",
-	nil,
-	[]test{
-		{"tls", nil, false},
-	},
-}
-
-var testSuites = []*testSuite{
-	&aesGCMTests,
-	&aesTests,
-	&ctrDRBGTests,
-	&ecdsa2KeyPairTests,
-	&ecdsa2PKVTests,
-	&ecdsa2SigGenTests,
-	&ecdsa2SigVerTests,
-	&hmacTests,
-	&keyWrapTests,
-	&rsa2KeyGenTests,
-	&rsa2SigGenTests,
-	&rsa2SigVerTests,
-	&shaTests,
-	&shaMonteTests,
-	&tdesTests,
-	&kasTests,
-	&tlsKDFTests,
-}
-
-// testInstance represents a specific test in a testSuite.
-type testInstance struct {
-	suite     *testSuite
-	testIndex int
-}
-
-func worker(wg *sync.WaitGroup, work <-chan testInstance) {
-	defer wg.Done()
-
-	for ti := range work {
-		test := ti.suite.tests[ti.testIndex]
-
-		if err := doTest(ti.suite, test); err != nil {
-			fmt.Fprintf(os.Stderr, "%s\n", err)
-			os.Exit(2)
-		}
-
-		if !*noFAX && !test.noFAX {
-			if err := compareFAX(ti.suite, test); err != nil {
-				fmt.Fprintf(os.Stderr, "%s\n", err)
-				os.Exit(3)
-			}
-		}
-	}
-}
-
-func checkAndroidPrereqs() error {
-	// The cavp binary and a matching libcrypto.so must be placed in
-	// /data/local/tmp before running this script.
-	if err := exec.Command("adb", "shell", "ls", androidCAVPPath).Run(); err != nil {
-		return errors.New("failed to list cavp binary; ensure that adb works and cavp binary is in place: " + err.Error())
-	}
-	if err := exec.Command("adb", "shell", "ls", androidLibCryptoPath).Run(); err != nil {
-		return errors.New("failed to list libcrypto.so; ensure that library is in place: " + err.Error())
-	}
-	return nil
-}
-
-func main() {
-	flag.Parse()
-
-	if *android {
-		if err := checkAndroidPrereqs(); err != nil {
-			fmt.Fprintf(os.Stderr, "%s\n", err)
-			os.Exit(1)
-		}
-	} else if len(*oraclePath) == 0 {
-		fmt.Fprintf(os.Stderr, "Must give -oracle-bin\n")
-		os.Exit(1)
-	}
-
-	work := make(chan testInstance)
-	var wg sync.WaitGroup
-
-	numWorkers := runtime.NumCPU()
-	if *android {
-		numWorkers = 1
-	}
-
-	for i := 0; i < numWorkers; i++ {
-		wg.Add(1)
-		go worker(&wg, work)
-	}
-
-	for _, suite := range testSuites {
-		for i := range suite.tests {
-			work <- testInstance{suite, i}
-		}
-	}
-
-	close(work)
-	wg.Wait()
-}
-
-func doTest(suite *testSuite, test test) error {
-	bin := *oraclePath
-	var args []string
-
-	if *android {
-		bin = "adb"
-		args = []string{"shell", "LD_LIBRARY_PATH=" + androidTmpPath, androidCAVPPath}
-	}
-
-	args = append(args, suite.suite)
-	args = append(args, test.args...)
-	reqPath := filepath.Join(suite.getDirectory(), "req", test.inFile+".req")
-	var reqPathOnDevice string
-
-	if *android {
-		reqPathOnDevice = path.Join(androidTmpPath, test.inFile+".req")
-		if err := exec.Command("adb", "push", reqPath, reqPathOnDevice).Run(); err != nil {
-			return errors.New("failed to push request file: " + err.Error())
-		}
-		args = append(args, reqPathOnDevice)
-	} else {
-		args = append(args, reqPath)
-	}
-
-	respDir := filepath.Join(suite.getDirectory(), "resp")
-	if err := os.Mkdir(respDir, 0755); err != nil && !os.IsExist(err) {
-		return fmt.Errorf("cannot create resp directory: %s", err)
-	}
-	outPath := filepath.Join(respDir, test.inFile+".rsp")
-	outFile, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
-	if err != nil {
-		return fmt.Errorf("cannot open output file for %q %q: %s", suite.getDirectory(), test.inFile, err)
-	}
-	defer outFile.Close()
-
-	cmd := exec.Command(bin, args...)
-	cmd.Stdout = outFile
-	cmd.Stderr = os.Stderr
-
-	cmdLine := strings.Join(append([]string{bin}, args...), " ")
-	startTime := time.Now()
-	if err := cmd.Run(); err != nil {
-		return fmt.Errorf("cannot run command for %q %q (%s): %s", suite.getDirectory(), test.inFile, cmdLine, err)
-	}
-
-	fmt.Printf("%s (%ds)\n", cmdLine, int(time.Since(startTime).Seconds()))
-
-	if *android {
-		exec.Command("adb", "shell", "rm", reqPathOnDevice).Run()
-	}
-
-	return nil
-}
-
-func canonicalizeLine(in string) string {
-	if strings.HasPrefix(in, "Result = P (") {
-		return "Result = P"
-	}
-	if strings.HasPrefix(in, "Result = F (") {
-		return "Result = F"
-	}
-	return in
-}
-
-func compareFAX(suite *testSuite, test test) error {
-	nextLineFunc := suite.nextLineFunc
-	if nextLineFunc == nil {
-		nextLineFunc = func(s *bufio.Scanner, state *nextLineState) (string, bool, bool) {
-			if !s.Scan() {
-				return "", false, false
-			}
-			return s.Text(), false, true
-		}
-	}
-
-	respPath := filepath.Join(suite.getDirectory(), "resp", test.inFile+".rsp")
-	respFile, err := os.Open(respPath)
-	if err != nil {
-		return fmt.Errorf("cannot read output of %q %q: %s", suite.getDirectory(), test.inFile, err)
-	}
-	defer respFile.Close()
-
-	faxPath := filepath.Join(suite.getDirectory(), "fax", test.inFile+".fax")
-	faxFile, err := os.Open(faxPath)
-	if err != nil {
-		return fmt.Errorf("cannot open fax file for %q %q: %s", suite.getDirectory(), test.inFile, err)
-	}
-	defer faxFile.Close()
-
-	respScanner := bufio.NewScanner(respFile)
-	faxScanner := bufio.NewScanner(faxFile)
-	var nextLineState nextLineState
-
-	lineNo := 0
-	inHeader := true
-
-	for respScanner.Scan() {
-		lineNo++
-		respLine := respScanner.Text()
-		var faxLine string
-		var isWildcard, ok bool
-
-		if inHeader && (len(respLine) == 0 || respLine[0] == '#') {
-			continue
-		}
-
-		for {
-			haveFaxLine := false
-
-			if inHeader {
-				for {
-					if faxLine, isWildcard, ok = nextLineFunc(faxScanner, &nextLineState); !ok {
-						break
-					}
-					if len(faxLine) != 0 && faxLine[0] != '#' {
-						haveFaxLine = true
-						break
-					}
-				}
-
-				inHeader = false
-			} else {
-				faxLine, isWildcard, haveFaxLine = nextLineFunc(faxScanner, &nextLineState)
-			}
-
-			if !haveFaxLine {
-				// Ignore blank lines at the end of the generated file.
-				if len(respLine) == 0 {
-					break
-				}
-				return fmt.Errorf("resp file is longer than fax for %q %q", suite.getDirectory(), test.inFile)
-			}
-
-			if strings.HasPrefix(faxLine, " (Reason: ") {
-				continue
-			}
-
-			break
-		}
-
-		if isWildcard || canonicalizeLine(faxLine) == canonicalizeLine(respLine) {
-			continue
-		}
-
-		return fmt.Errorf("resp and fax differ at line %d for %q %q: %q vs %q", lineNo, suite.getDirectory(), test.inFile, respLine, faxLine)
-	}
-
-	if _, _, ok := nextLineFunc(faxScanner, &nextLineState); ok {
-		return fmt.Errorf("fax file is longer than resp for %q %q", suite.getDirectory(), test.inFile)
-	}
-
-	return nil
-}
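
Note on the deleted runner: apart from the per-suite tables above, its core is
a conventional Go fan-out — one goroutine per CPU (one total under -android,
since a single device serializes everything) draining an unbuffered channel of
testInstance values, with a sync.WaitGroup for shutdown, and os.Exit on the
first failure. A stripped-down sketch of the same pattern, with illustrative
names:

// workerpool.go: the channel-plus-WaitGroup fan-out used by the deleted
// run_cavp.go, reduced to its shape. Not the original code.
package main

import (
	"fmt"
	"runtime"
	"sync"
)

type job struct{ suite, test string }

func worker(wg *sync.WaitGroup, work <-chan job) {
	defer wg.Done()
	for j := range work {
		// run_cavp.go would exec the oracle binary here and, on
		// failure, abort the whole run with a non-zero exit code.
		fmt.Printf("ran %s/%s\n", j.suite, j.test)
	}
}

func main() {
	work := make(chan job)
	var wg sync.WaitGroup

	for i := 0; i < runtime.NumCPU(); i++ {
		wg.Add(1)
		go worker(&wg, work)
	}
	for _, j := range []job{{"AES", "ECBMCT128"}, {"SHA", "SHA1Monte"}} {
		work <- j // blocks until a worker is free
	}
	close(work) // ends each worker's range loop
	wg.Wait()
}

Because the channel is unbuffered, the producer blocks on each send until a
worker is idle, so at most numWorkers tests are ever in flight; closing the
channel is what lets the workers' range loops terminate before wg.Wait()
returns.
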
diff --git a/src/util/fipstools/delocate/delocate.peg b/src/util/fipstools/delocate/delocate.peg
index f79ed76..c253a48 100644
--- a/src/util/fipstools/delocate/delocate.peg
+++ b/src/util/fipstools/delocate/delocate.peg
@@ -44,7 +44,7 @@
 SymbolArg <- (OpenParen WS?)? (
                Offset /
                SymbolType /
-               (Offset / LocalSymbol / SymbolName / Dot) WS? Operator WS? (Offset / LocalSymbol / SymbolName) /
+               (Offset / LocalSymbol / SymbolName / Dot) (WS? Operator WS? (Offset / LocalSymbol / SymbolName))* /
                LocalSymbol TCMarker? /
                SymbolName Offset /
                SymbolName TCMarker?)
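
The grammar change above relaxes SymbolArg from exactly one trailing operator
term to zero or more: `(WS? Operator WS? (Offset / LocalSymbol / SymbolName))*`,
so symbol expressions that chain several additions and subtractions (e.g.
`foo+8-16` or `.Lsym-.Lbase+4`) now parse. The regenerated delocate.peg.go
below encodes the same repetition as a backtracking loop. As a rough
illustration of what the new rule accepts — not the delocate parser itself,
and with deliberately simplified notions of Offset and Operator — a
hypothetical Go matcher:

// symbolarg.go: a hand-rolled analogue of the new SymbolArg repetition.
// It accepts a leading term followed by zero or more "operator term"
// pairs; the real PEG rule also handles parens, TC markers, and shifts.
package main

import (
	"fmt"
	"regexp"
)

var (
	// A term is a simplified stand-in for Offset / LocalSymbol / SymbolName.
	term = `(?:[0-9]+|[A-Za-z_.][A-Za-z0-9_$.]*)`
	// Leading term, then zero or more operator+term pairs — the "*"
	// that the grammar change introduces.
	symbolArg = regexp.MustCompile(`^` + term + `(?:\s*[-+]\s*` + term + `)*$`)
)

func main() {
	for _, s := range []string{"foo", "foo+8", "foo+8-16", ".Lsym-.Lbase+4"} {
		fmt.Println(s, symbolArg.MatchString(s))
	}
}
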
diff --git a/src/util/fipstools/delocate/delocate.peg.go b/src/util/fipstools/delocate/delocate.peg.go
index 56c4a20..ea7c195 100644
--- a/src/util/fipstools/delocate/delocate.peg.go
+++ b/src/util/fipstools/delocate/delocate.peg.go
@@ -2540,7 +2540,7 @@
 			position, tokenIndex = position291, tokenIndex291
 			return false
 		},
-		/* 16 SymbolArg <- <((OpenParen WS?)? (Offset / SymbolType / ((Offset / LocalSymbol / SymbolName / Dot) WS? Operator WS? (Offset / LocalSymbol / SymbolName)) / (LocalSymbol TCMarker?) / (SymbolName Offset) / (SymbolName TCMarker?)) (WS? CloseParen)? (WS? SymbolShift)?)> */
+		/* 16 SymbolArg <- <((OpenParen WS?)? (Offset / SymbolType / ((Offset / LocalSymbol / SymbolName / Dot) (WS? Operator WS? (Offset / LocalSymbol / SymbolName))*) / (LocalSymbol TCMarker?) / (SymbolName Offset) / (SymbolName TCMarker?)) (WS? CloseParen)? (WS? SymbolShift)?)> */
 		func() bool {
 			position299, tokenIndex299 := position, tokenIndex
 			{
@@ -2604,131 +2604,138 @@
 						}
 					}
 				l309:
+				l313:
 					{
-						position313, tokenIndex313 := position, tokenIndex
-						if !_rules[ruleWS]() {
-							goto l313
+						position314, tokenIndex314 := position, tokenIndex
+						{
+							position315, tokenIndex315 := position, tokenIndex
+							if !_rules[ruleWS]() {
+								goto l315
+							}
+							goto l316
+						l315:
+							position, tokenIndex = position315, tokenIndex315
 						}
-						goto l314
-					l313:
-						position, tokenIndex = position313, tokenIndex313
-					}
-				l314:
-					if !_rules[ruleOperator]() {
-						goto l308
-					}
-					{
-						position315, tokenIndex315 := position, tokenIndex
-						if !_rules[ruleWS]() {
-							goto l315
+					l316:
+						if !_rules[ruleOperator]() {
+							goto l314
 						}
-						goto l316
-					l315:
-						position, tokenIndex = position315, tokenIndex315
-					}
-				l316:
-					{
-						position317, tokenIndex317 := position, tokenIndex
-						if !_rules[ruleOffset]() {
+						{
+							position317, tokenIndex317 := position, tokenIndex
+							if !_rules[ruleWS]() {
+								goto l317
+							}
 							goto l318
+						l317:
+							position, tokenIndex = position317, tokenIndex317
 						}
-						goto l317
 					l318:
-						position, tokenIndex = position317, tokenIndex317
-						if !_rules[ruleLocalSymbol]() {
+						{
+							position319, tokenIndex319 := position, tokenIndex
+							if !_rules[ruleOffset]() {
+								goto l320
+							}
 							goto l319
+						l320:
+							position, tokenIndex = position319, tokenIndex319
+							if !_rules[ruleLocalSymbol]() {
+								goto l321
+							}
+							goto l319
+						l321:
+							position, tokenIndex = position319, tokenIndex319
+							if !_rules[ruleSymbolName]() {
+								goto l314
+							}
 						}
-						goto l317
 					l319:
-						position, tokenIndex = position317, tokenIndex317
-						if !_rules[ruleSymbolName]() {
-							goto l308
-						}
+						goto l313
+					l314:
+						position, tokenIndex = position314, tokenIndex314
 					}
-				l317:
 					goto l305
 				l308:
 					position, tokenIndex = position305, tokenIndex305
 					if !_rules[ruleLocalSymbol]() {
-						goto l320
+						goto l322
 					}
 					{
-						position321, tokenIndex321 := position, tokenIndex
+						position323, tokenIndex323 := position, tokenIndex
 						if !_rules[ruleTCMarker]() {
-							goto l321
+							goto l323
 						}
-						goto l322
-					l321:
-						position, tokenIndex = position321, tokenIndex321
+						goto l324
+					l323:
+						position, tokenIndex = position323, tokenIndex323
 					}
-				l322:
+				l324:
 					goto l305
-				l320:
+				l322:
 					position, tokenIndex = position305, tokenIndex305
 					if !_rules[ruleSymbolName]() {
-						goto l323
+						goto l325
 					}
 					if !_rules[ruleOffset]() {
-						goto l323
+						goto l325
 					}
 					goto l305
-				l323:
+				l325:
 					position, tokenIndex = position305, tokenIndex305
 					if !_rules[ruleSymbolName]() {
 						goto l299
 					}
 					{
-						position324, tokenIndex324 := position, tokenIndex
+						position326, tokenIndex326 := position, tokenIndex
 						if !_rules[ruleTCMarker]() {
-							goto l324
+							goto l326
 						}
-						goto l325
-					l324:
-						position, tokenIndex = position324, tokenIndex324
+						goto l327
+					l326:
+						position, tokenIndex = position326, tokenIndex326
 					}
-				l325:
+				l327:
 				}
 			l305:
 				{
-					position326, tokenIndex326 := position, tokenIndex
+					position328, tokenIndex328 := position, tokenIndex
 					{
-						position328, tokenIndex328 := position, tokenIndex
+						position330, tokenIndex330 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l328
+							goto l330
 						}
-						goto l329
-					l328:
-						position, tokenIndex = position328, tokenIndex328
+						goto l331
+					l330:
+						position, tokenIndex = position330, tokenIndex330
 					}
-				l329:
+				l331:
 					if !_rules[ruleCloseParen]() {
-						goto l326
+						goto l328
 					}
-					goto l327
-				l326:
-					position, tokenIndex = position326, tokenIndex326
+					goto l329
+				l328:
+					position, tokenIndex = position328, tokenIndex328
 				}
-			l327:
+			l329:
 				{
-					position330, tokenIndex330 := position, tokenIndex
+					position332, tokenIndex332 := position, tokenIndex
 					{
-						position332, tokenIndex332 := position, tokenIndex
+						position334, tokenIndex334 := position, tokenIndex
 						if !_rules[ruleWS]() {
-							goto l332
+							goto l334
 						}
-						goto l333
-					l332:
-						position, tokenIndex = position332, tokenIndex332
+						goto l335
+					l334:
+						position, tokenIndex = position334, tokenIndex334
 					}
-				l333:
+				l335:
 					if !_rules[ruleSymbolShift]() {
-						goto l330
+						goto l332
 					}
-					goto l331
-				l330:
-					position, tokenIndex = position330, tokenIndex330
+					goto l333
+				l332:
+					position, tokenIndex = position332, tokenIndex332
 				}
-			l331:
+			l333:
 				add(ruleSymbolArg, position300)
 			}
 			return true
@@ -2738,721 +2745,707 @@
 		},
 		/* 17 OpenParen <- <'('> */
 		func() bool {
-			position334, tokenIndex334 := position, tokenIndex
-			{
-				position335 := position
-				if buffer[position] != rune('(') {
-					goto l334
-				}
-				position++
-				add(ruleOpenParen, position335)
-			}
-			return true
-		l334:
-			position, tokenIndex = position334, tokenIndex334
-			return false
-		},
-		/* 18 CloseParen <- <')'> */
-		func() bool {
 			position336, tokenIndex336 := position, tokenIndex
 			{
 				position337 := position
-				if buffer[position] != rune(')') {
+				if buffer[position] != rune('(') {
 					goto l336
 				}
 				position++
-				add(ruleCloseParen, position337)
+				add(ruleOpenParen, position337)
 			}
 			return true
 		l336:
 			position, tokenIndex = position336, tokenIndex336
 			return false
 		},
-		/* 19 SymbolType <- <(('@' / '%') (('f' 'u' 'n' 'c' 't' 'i' 'o' 'n') / ('o' 'b' 'j' 'e' 'c' 't')))> */
+		/* 18 CloseParen <- <')'> */
 		func() bool {
 			position338, tokenIndex338 := position, tokenIndex
 			{
 				position339 := position
-				{
-					position340, tokenIndex340 := position, tokenIndex
-					if buffer[position] != rune('@') {
-						goto l341
-					}
-					position++
-					goto l340
-				l341:
-					position, tokenIndex = position340, tokenIndex340
-					if buffer[position] != rune('%') {
-						goto l338
-					}
-					position++
+				if buffer[position] != rune(')') {
+					goto l338
 				}
-			l340:
-				{
-					position342, tokenIndex342 := position, tokenIndex
-					if buffer[position] != rune('f') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('u') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('n') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('c') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('t') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('i') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('o') {
-						goto l343
-					}
-					position++
-					if buffer[position] != rune('n') {
-						goto l343
-					}
-					position++
-					goto l342
-				l343:
-					position, tokenIndex = position342, tokenIndex342
-					if buffer[position] != rune('o') {
-						goto l338
-					}
-					position++
-					if buffer[position] != rune('b') {
-						goto l338
-					}
-					position++
-					if buffer[position] != rune('j') {
-						goto l338
-					}
-					position++
-					if buffer[position] != rune('e') {
-						goto l338
-					}
-					position++
-					if buffer[position] != rune('c') {
-						goto l338
-					}
-					position++
-					if buffer[position] != rune('t') {
-						goto l338
-					}
-					position++
-				}
-			l342:
-				add(ruleSymbolType, position339)
+				position++
+				add(ruleCloseParen, position339)
 			}
 			return true
 		l338:
 			position, tokenIndex = position338, tokenIndex338
 			return false
 		},
-		/* 20 Dot <- <'.'> */
+		/* 19 SymbolType <- <(('@' / '%') (('f' 'u' 'n' 'c' 't' 'i' 'o' 'n') / ('o' 'b' 'j' 'e' 'c' 't')))> */
 		func() bool {
-			position344, tokenIndex344 := position, tokenIndex
+			position340, tokenIndex340 := position, tokenIndex
 			{
-				position345 := position
-				if buffer[position] != rune('.') {
-					goto l344
+				position341 := position
+				{
+					position342, tokenIndex342 := position, tokenIndex
+					if buffer[position] != rune('@') {
+						goto l343
+					}
+					position++
+					goto l342
+				l343:
+					position, tokenIndex = position342, tokenIndex342
+					if buffer[position] != rune('%') {
+						goto l340
+					}
+					position++
 				}
-				position++
-				add(ruleDot, position345)
+			l342:
+				{
+					position344, tokenIndex344 := position, tokenIndex
+					if buffer[position] != rune('f') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('u') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('n') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('c') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('t') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('i') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('o') {
+						goto l345
+					}
+					position++
+					if buffer[position] != rune('n') {
+						goto l345
+					}
+					position++
+					goto l344
+				l345:
+					position, tokenIndex = position344, tokenIndex344
+					if buffer[position] != rune('o') {
+						goto l340
+					}
+					position++
+					if buffer[position] != rune('b') {
+						goto l340
+					}
+					position++
+					if buffer[position] != rune('j') {
+						goto l340
+					}
+					position++
+					if buffer[position] != rune('e') {
+						goto l340
+					}
+					position++
+					if buffer[position] != rune('c') {
+						goto l340
+					}
+					position++
+					if buffer[position] != rune('t') {
+						goto l340
+					}
+					position++
+				}
+			l344:
+				add(ruleSymbolType, position341)
 			}
 			return true
-		l344:
-			position, tokenIndex = position344, tokenIndex344
+		l340:
+			position, tokenIndex = position340, tokenIndex340
 			return false
 		},
-		/* 21 TCMarker <- <('[' 'T' 'C' ']')> */
+		/* 20 Dot <- <'.'> */
 		func() bool {
 			position346, tokenIndex346 := position, tokenIndex
 			{
 				position347 := position
-				if buffer[position] != rune('[') {
+				if buffer[position] != rune('.') {
 					goto l346
 				}
 				position++
-				if buffer[position] != rune('T') {
-					goto l346
-				}
-				position++
-				if buffer[position] != rune('C') {
-					goto l346
-				}
-				position++
-				if buffer[position] != rune(']') {
-					goto l346
-				}
-				position++
-				add(ruleTCMarker, position347)
+				add(ruleDot, position347)
 			}
 			return true
 		l346:
 			position, tokenIndex = position346, tokenIndex346
 			return false
 		},
-		/* 22 EscapedChar <- <('\\' .)> */
+		/* 21 TCMarker <- <('[' 'T' 'C' ']')> */
 		func() bool {
 			position348, tokenIndex348 := position, tokenIndex
 			{
 				position349 := position
-				if buffer[position] != rune('\\') {
+				if buffer[position] != rune('[') {
 					goto l348
 				}
 				position++
-				if !matchDot() {
+				if buffer[position] != rune('T') {
 					goto l348
 				}
-				add(ruleEscapedChar, position349)
+				position++
+				if buffer[position] != rune('C') {
+					goto l348
+				}
+				position++
+				if buffer[position] != rune(']') {
+					goto l348
+				}
+				position++
+				add(ruleTCMarker, position349)
 			}
 			return true
 		l348:
 			position, tokenIndex = position348, tokenIndex348
 			return false
 		},
-		/* 23 WS <- <(' ' / '\t')+> */
+		/* 22 EscapedChar <- <('\\' .)> */
 		func() bool {
 			position350, tokenIndex350 := position, tokenIndex
 			{
 				position351 := position
-				{
-					position354, tokenIndex354 := position, tokenIndex
-					if buffer[position] != rune(' ') {
-						goto l355
-					}
-					position++
-					goto l354
-				l355:
-					position, tokenIndex = position354, tokenIndex354
-					if buffer[position] != rune('\t') {
-						goto l350
-					}
-					position++
+				if buffer[position] != rune('\\') {
+					goto l350
 				}
-			l354:
-			l352:
-				{
-					position353, tokenIndex353 := position, tokenIndex
-					{
-						position356, tokenIndex356 := position, tokenIndex
-						if buffer[position] != rune(' ') {
-							goto l357
-						}
-						position++
-						goto l356
-					l357:
-						position, tokenIndex = position356, tokenIndex356
-						if buffer[position] != rune('\t') {
-							goto l353
-						}
-						position++
-					}
-				l356:
-					goto l352
-				l353:
-					position, tokenIndex = position353, tokenIndex353
+				position++
+				if !matchDot() {
+					goto l350
 				}
-				add(ruleWS, position351)
+				add(ruleEscapedChar, position351)
 			}
 			return true
 		l350:
 			position, tokenIndex = position350, tokenIndex350
 			return false
 		},
-		/* 24 Comment <- <((('/' '/') / '#') (!'\n' .)*)> */
+		/* 23 WS <- <(' ' / '\t')+> */
 		func() bool {
-			position358, tokenIndex358 := position, tokenIndex
+			position352, tokenIndex352 := position, tokenIndex
 			{
-				position359 := position
+				position353 := position
 				{
-					position360, tokenIndex360 := position, tokenIndex
-					if buffer[position] != rune('/') {
-						goto l361
+					position356, tokenIndex356 := position, tokenIndex
+					if buffer[position] != rune(' ') {
+						goto l357
 					}
 					position++
-					if buffer[position] != rune('/') {
-						goto l361
-					}
-					position++
-					goto l360
-				l361:
-					position, tokenIndex = position360, tokenIndex360
-					if buffer[position] != rune('#') {
-						goto l358
+					goto l356
+				l357:
+					position, tokenIndex = position356, tokenIndex356
+					if buffer[position] != rune('\t') {
+						goto l352
 					}
 					position++
 				}
-			l360:
-			l362:
+			l356:
+			l354:
 				{
-					position363, tokenIndex363 := position, tokenIndex
+					position355, tokenIndex355 := position, tokenIndex
 					{
-						position364, tokenIndex364 := position, tokenIndex
-						if buffer[position] != rune('\n') {
-							goto l364
+						position358, tokenIndex358 := position, tokenIndex
+						if buffer[position] != rune(' ') {
+							goto l359
 						}
 						position++
-						goto l363
-					l364:
-						position, tokenIndex = position364, tokenIndex364
+						goto l358
+					l359:
+						position, tokenIndex = position358, tokenIndex358
+						if buffer[position] != rune('\t') {
+							goto l355
+						}
+						position++
 					}
-					if !matchDot() {
-						goto l363
-					}
-					goto l362
-				l363:
-					position, tokenIndex = position363, tokenIndex363
+				l358:
+					goto l354
+				l355:
+					position, tokenIndex = position355, tokenIndex355
 				}
-				add(ruleComment, position359)
+				add(ruleWS, position353)
 			}
 			return true
-		l358:
-			position, tokenIndex = position358, tokenIndex358
+		l352:
+			position, tokenIndex = position352, tokenIndex352
+			return false
+		},
+		/* 24 Comment <- <((('/' '/') / '#') (!'\n' .)*)> */
+		func() bool {
+			position360, tokenIndex360 := position, tokenIndex
+			{
+				position361 := position
+				{
+					position362, tokenIndex362 := position, tokenIndex
+					if buffer[position] != rune('/') {
+						goto l363
+					}
+					position++
+					if buffer[position] != rune('/') {
+						goto l363
+					}
+					position++
+					goto l362
+				l363:
+					position, tokenIndex = position362, tokenIndex362
+					if buffer[position] != rune('#') {
+						goto l360
+					}
+					position++
+				}
+			l362:
+			l364:
+				{
+					position365, tokenIndex365 := position, tokenIndex
+					{
+						position366, tokenIndex366 := position, tokenIndex
+						if buffer[position] != rune('\n') {
+							goto l366
+						}
+						position++
+						goto l365
+					l366:
+						position, tokenIndex = position366, tokenIndex366
+					}
+					if !matchDot() {
+						goto l365
+					}
+					goto l364
+				l365:
+					position, tokenIndex = position365, tokenIndex365
+				}
+				add(ruleComment, position361)
+			}
+			return true
+		l360:
+			position, tokenIndex = position360, tokenIndex360
 			return false
 		},
 		/* 25 Label <- <((LocalSymbol / LocalLabel / SymbolName) ':')> */
 		func() bool {
-			position365, tokenIndex365 := position, tokenIndex
+			position367, tokenIndex367 := position, tokenIndex
 			{
-				position366 := position
+				position368 := position
 				{
-					position367, tokenIndex367 := position, tokenIndex
+					position369, tokenIndex369 := position, tokenIndex
 					if !_rules[ruleLocalSymbol]() {
-						goto l368
+						goto l370
 					}
-					goto l367
-				l368:
-					position, tokenIndex = position367, tokenIndex367
+					goto l369
+				l370:
+					position, tokenIndex = position369, tokenIndex369
 					if !_rules[ruleLocalLabel]() {
-						goto l369
+						goto l371
 					}
-					goto l367
-				l369:
-					position, tokenIndex = position367, tokenIndex367
+					goto l369
+				l371:
+					position, tokenIndex = position369, tokenIndex369
 					if !_rules[ruleSymbolName]() {
-						goto l365
+						goto l367
 					}
 				}
-			l367:
+			l369:
 				if buffer[position] != rune(':') {
-					goto l365
+					goto l367
 				}
 				position++
-				add(ruleLabel, position366)
+				add(ruleLabel, position368)
 			}
 			return true
-		l365:
-			position, tokenIndex = position365, tokenIndex365
+		l367:
+			position, tokenIndex = position367, tokenIndex367
 			return false
 		},
 		/* 26 SymbolName <- <(([a-z] / [A-Z] / '.' / '_') ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]) / '$' / '_')*)> */
 		func() bool {
-			position370, tokenIndex370 := position, tokenIndex
+			position372, tokenIndex372 := position, tokenIndex
 			{
-				position371 := position
+				position373 := position
 				{
-					position372, tokenIndex372 := position, tokenIndex
+					position374, tokenIndex374 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l373
-					}
-					position++
-					goto l372
-				l373:
-					position, tokenIndex = position372, tokenIndex372
-					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l374
-					}
-					position++
-					goto l372
-				l374:
-					position, tokenIndex = position372, tokenIndex372
-					if buffer[position] != rune('.') {
 						goto l375
 					}
 					position++
-					goto l372
+					goto l374
 				l375:
-					position, tokenIndex = position372, tokenIndex372
+					position, tokenIndex = position374, tokenIndex374
+					if c := buffer[position]; c < rune('A') || c > rune('Z') {
+						goto l376
+					}
+					position++
+					goto l374
+				l376:
+					position, tokenIndex = position374, tokenIndex374
+					if buffer[position] != rune('.') {
+						goto l377
+					}
+					position++
+					goto l374
+				l377:
+					position, tokenIndex = position374, tokenIndex374
 					if buffer[position] != rune('_') {
-						goto l370
+						goto l372
 					}
 					position++
 				}
-			l372:
-			l376:
+			l374:
+			l378:
 				{
-					position377, tokenIndex377 := position, tokenIndex
+					position379, tokenIndex379 := position, tokenIndex
 					{
-						position378, tokenIndex378 := position, tokenIndex
+						position380, tokenIndex380 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l379
-						}
-						position++
-						goto l378
-					l379:
-						position, tokenIndex = position378, tokenIndex378
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l380
-						}
-						position++
-						goto l378
-					l380:
-						position, tokenIndex = position378, tokenIndex378
-						if buffer[position] != rune('.') {
 							goto l381
 						}
 						position++
-						goto l378
+						goto l380
 					l381:
-						position, tokenIndex = position378, tokenIndex378
+						position, tokenIndex = position380, tokenIndex380
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
+							goto l382
+						}
+						position++
+						goto l380
+					l382:
+						position, tokenIndex = position380, tokenIndex380
+						if buffer[position] != rune('.') {
+							goto l383
+						}
+						position++
+						goto l380
+					l383:
+						position, tokenIndex = position380, tokenIndex380
 						{
-							position383, tokenIndex383 := position, tokenIndex
+							position385, tokenIndex385 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l386
+							}
+							position++
+							goto l385
+						l386:
+							position, tokenIndex = position385, tokenIndex385
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
 								goto l384
 							}
 							position++
-							goto l383
-						l384:
-							position, tokenIndex = position383, tokenIndex383
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l382
-							}
-							position++
 						}
-					l383:
-						goto l378
-					l382:
-						position, tokenIndex = position378, tokenIndex378
+					l385:
+						goto l380
+					l384:
+						position, tokenIndex = position380, tokenIndex380
 						if buffer[position] != rune('$') {
-							goto l385
+							goto l387
 						}
 						position++
-						goto l378
-					l385:
-						position, tokenIndex = position378, tokenIndex378
+						goto l380
+					l387:
+						position, tokenIndex = position380, tokenIndex380
 						if buffer[position] != rune('_') {
-							goto l377
+							goto l379
 						}
 						position++
 					}
-				l378:
-					goto l376
-				l377:
-					position, tokenIndex = position377, tokenIndex377
+				l380:
+					goto l378
+				l379:
+					position, tokenIndex = position379, tokenIndex379
 				}
-				add(ruleSymbolName, position371)
+				add(ruleSymbolName, position373)
 			}
 			return true
-		l370:
-			position, tokenIndex = position370, tokenIndex370
+		l372:
+			position, tokenIndex = position372, tokenIndex372
 			return false
 		},
 		/* 27 LocalSymbol <- <('.' 'L' ([a-z] / [A-Z] / ([a-z] / [A-Z]) / '.' / ([0-9] / [0-9]) / '$' / '_')+)> */
 		func() bool {
-			position386, tokenIndex386 := position, tokenIndex
+			position388, tokenIndex388 := position, tokenIndex
 			{
-				position387 := position
+				position389 := position
 				if buffer[position] != rune('.') {
-					goto l386
+					goto l388
 				}
 				position++
 				if buffer[position] != rune('L') {
-					goto l386
+					goto l388
 				}
 				position++
 				{
-					position390, tokenIndex390 := position, tokenIndex
+					position392, tokenIndex392 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l391
+						goto l393
 					}
 					position++
-					goto l390
-				l391:
-					position, tokenIndex = position390, tokenIndex390
+					goto l392
+				l393:
+					position, tokenIndex = position392, tokenIndex392
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l392
+						goto l394
 					}
 					position++
-					goto l390
-				l392:
-					position, tokenIndex = position390, tokenIndex390
+					goto l392
+				l394:
+					position, tokenIndex = position392, tokenIndex392
 					{
-						position394, tokenIndex394 := position, tokenIndex
+						position396, tokenIndex396 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
+							goto l397
+						}
+						position++
+						goto l396
+					l397:
+						position, tokenIndex = position396, tokenIndex396
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l395
 						}
 						position++
-						goto l394
-					l395:
-						position, tokenIndex = position394, tokenIndex394
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l393
-						}
-						position++
 					}
-				l394:
-					goto l390
-				l393:
-					position, tokenIndex = position390, tokenIndex390
+				l396:
+					goto l392
+				l395:
+					position, tokenIndex = position392, tokenIndex392
 					if buffer[position] != rune('.') {
-						goto l396
+						goto l398
 					}
 					position++
-					goto l390
-				l396:
-					position, tokenIndex = position390, tokenIndex390
+					goto l392
+				l398:
+					position, tokenIndex = position392, tokenIndex392
 					{
-						position398, tokenIndex398 := position, tokenIndex
+						position400, tokenIndex400 := position, tokenIndex
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l401
+						}
+						position++
+						goto l400
+					l401:
+						position, tokenIndex = position400, tokenIndex400
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
 							goto l399
 						}
 						position++
-						goto l398
-					l399:
-						position, tokenIndex = position398, tokenIndex398
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l397
-						}
-						position++
 					}
-				l398:
-					goto l390
-				l397:
-					position, tokenIndex = position390, tokenIndex390
+				l400:
+					goto l392
+				l399:
+					position, tokenIndex = position392, tokenIndex392
 					if buffer[position] != rune('$') {
-						goto l400
+						goto l402
 					}
 					position++
-					goto l390
-				l400:
-					position, tokenIndex = position390, tokenIndex390
+					goto l392
+				l402:
+					position, tokenIndex = position392, tokenIndex392
 					if buffer[position] != rune('_') {
-						goto l386
+						goto l388
 					}
 					position++
 				}
+			l392:
 			l390:
-			l388:
 				{
-					position389, tokenIndex389 := position, tokenIndex
+					position391, tokenIndex391 := position, tokenIndex
 					{
-						position401, tokenIndex401 := position, tokenIndex
+						position403, tokenIndex403 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l402
+							goto l404
 						}
 						position++
-						goto l401
-					l402:
-						position, tokenIndex = position401, tokenIndex401
+						goto l403
+					l404:
+						position, tokenIndex = position403, tokenIndex403
 						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l403
+							goto l405
 						}
 						position++
-						goto l401
-					l403:
-						position, tokenIndex = position401, tokenIndex401
+						goto l403
+					l405:
+						position, tokenIndex = position403, tokenIndex403
 						{
-							position405, tokenIndex405 := position, tokenIndex
+							position407, tokenIndex407 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
+								goto l408
+							}
+							position++
+							goto l407
+						l408:
+							position, tokenIndex = position407, tokenIndex407
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
 								goto l406
 							}
 							position++
-							goto l405
-						l406:
-							position, tokenIndex = position405, tokenIndex405
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l404
-							}
-							position++
 						}
-					l405:
-						goto l401
-					l404:
-						position, tokenIndex = position401, tokenIndex401
+					l407:
+						goto l403
+					l406:
+						position, tokenIndex = position403, tokenIndex403
 						if buffer[position] != rune('.') {
-							goto l407
+							goto l409
 						}
 						position++
-						goto l401
-					l407:
-						position, tokenIndex = position401, tokenIndex401
+						goto l403
+					l409:
+						position, tokenIndex = position403, tokenIndex403
 						{
-							position409, tokenIndex409 := position, tokenIndex
+							position411, tokenIndex411 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l412
+							}
+							position++
+							goto l411
+						l412:
+							position, tokenIndex = position411, tokenIndex411
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
 								goto l410
 							}
 							position++
-							goto l409
-						l410:
-							position, tokenIndex = position409, tokenIndex409
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l408
-							}
-							position++
 						}
-					l409:
-						goto l401
-					l408:
-						position, tokenIndex = position401, tokenIndex401
+					l411:
+						goto l403
+					l410:
+						position, tokenIndex = position403, tokenIndex403
 						if buffer[position] != rune('$') {
-							goto l411
+							goto l413
 						}
 						position++
-						goto l401
-					l411:
-						position, tokenIndex = position401, tokenIndex401
+						goto l403
+					l413:
+						position, tokenIndex = position403, tokenIndex403
 						if buffer[position] != rune('_') {
-							goto l389
+							goto l391
 						}
 						position++
 					}
-				l401:
-					goto l388
-				l389:
-					position, tokenIndex = position389, tokenIndex389
+				l403:
+					goto l390
+				l391:
+					position, tokenIndex = position391, tokenIndex391
 				}
-				add(ruleLocalSymbol, position387)
+				add(ruleLocalSymbol, position389)
 			}
 			return true
-		l386:
-			position, tokenIndex = position386, tokenIndex386
+		l388:
+			position, tokenIndex = position388, tokenIndex388
 			return false
 		},
 		/* 28 LocalLabel <- <([0-9] ([0-9] / '$')*)> */
 		func() bool {
-			position412, tokenIndex412 := position, tokenIndex
+			position414, tokenIndex414 := position, tokenIndex
 			{
-				position413 := position
+				position415 := position
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l412
+					goto l414
 				}
 				position++
-			l414:
+			l416:
 				{
-					position415, tokenIndex415 := position, tokenIndex
+					position417, tokenIndex417 := position, tokenIndex
 					{
-						position416, tokenIndex416 := position, tokenIndex
+						position418, tokenIndex418 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l419
+						}
+						position++
+						goto l418
+					l419:
+						position, tokenIndex = position418, tokenIndex418
+						if buffer[position] != rune('$') {
 							goto l417
 						}
 						position++
-						goto l416
-					l417:
-						position, tokenIndex = position416, tokenIndex416
-						if buffer[position] != rune('$') {
-							goto l415
-						}
-						position++
 					}
-				l416:
-					goto l414
-				l415:
-					position, tokenIndex = position415, tokenIndex415
+				l418:
+					goto l416
+				l417:
+					position, tokenIndex = position417, tokenIndex417
 				}
-				add(ruleLocalLabel, position413)
+				add(ruleLocalLabel, position415)
 			}
 			return true
-		l412:
-			position, tokenIndex = position412, tokenIndex412
+		l414:
+			position, tokenIndex = position414, tokenIndex414
 			return false
 		},
 		/* 29 LocalLabelRef <- <([0-9] ([0-9] / '$')* ('b' / 'f'))> */
 		func() bool {
-			position418, tokenIndex418 := position, tokenIndex
+			position420, tokenIndex420 := position, tokenIndex
 			{
-				position419 := position
+				position421 := position
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l418
+					goto l420
 				}
 				position++
-			l420:
+			l422:
 				{
-					position421, tokenIndex421 := position, tokenIndex
+					position423, tokenIndex423 := position, tokenIndex
 					{
-						position422, tokenIndex422 := position, tokenIndex
+						position424, tokenIndex424 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l425
+						}
+						position++
+						goto l424
+					l425:
+						position, tokenIndex = position424, tokenIndex424
+						if buffer[position] != rune('$') {
 							goto l423
 						}
 						position++
-						goto l422
-					l423:
-						position, tokenIndex = position422, tokenIndex422
-						if buffer[position] != rune('$') {
-							goto l421
-						}
-						position++
 					}
-				l422:
-					goto l420
-				l421:
-					position, tokenIndex = position421, tokenIndex421
+				l424:
+					goto l422
+				l423:
+					position, tokenIndex = position423, tokenIndex423
 				}
 				{
-					position424, tokenIndex424 := position, tokenIndex
+					position426, tokenIndex426 := position, tokenIndex
 					if buffer[position] != rune('b') {
-						goto l425
+						goto l427
 					}
 					position++
-					goto l424
-				l425:
-					position, tokenIndex = position424, tokenIndex424
+					goto l426
+				l427:
+					position, tokenIndex = position426, tokenIndex426
 					if buffer[position] != rune('f') {
-						goto l418
+						goto l420
 					}
 					position++
 				}
-			l424:
-				add(ruleLocalLabelRef, position419)
+			l426:
+				add(ruleLocalLabelRef, position421)
 			}
 			return true
-		l418:
-			position, tokenIndex = position418, tokenIndex418
+		l420:
+			position, tokenIndex = position420, tokenIndex420
 			return false
 		},
 		/* 30 Instruction <- <(InstructionName (WS InstructionArg (WS? ',' WS? InstructionArg)*)?)> */
 		func() bool {
-			position426, tokenIndex426 := position, tokenIndex
+			position428, tokenIndex428 := position, tokenIndex
 			{
-				position427 := position
+				position429 := position
 				if !_rules[ruleInstructionName]() {
-					goto l426
+					goto l428
 				}
 				{
-					position428, tokenIndex428 := position, tokenIndex
+					position430, tokenIndex430 := position, tokenIndex
 					if !_rules[ruleWS]() {
-						goto l428
+						goto l430
 					}
 					if !_rules[ruleInstructionArg]() {
-						goto l428
+						goto l430
 					}
-				l430:
+				l432:
 					{
-						position431, tokenIndex431 := position, tokenIndex
-						{
-							position432, tokenIndex432 := position, tokenIndex
-							if !_rules[ruleWS]() {
-								goto l432
-							}
-							goto l433
-						l432:
-							position, tokenIndex = position432, tokenIndex432
-						}
-					l433:
-						if buffer[position] != rune(',') {
-							goto l431
-						}
-						position++
+						position433, tokenIndex433 := position, tokenIndex
 						{
 							position434, tokenIndex434 := position, tokenIndex
 							if !_rules[ruleWS]() {
@@ -3463,1013 +3456,1009 @@
 							position, tokenIndex = position434, tokenIndex434
 						}
 					l435:
-						if !_rules[ruleInstructionArg]() {
-							goto l431
+						if buffer[position] != rune(',') {
+							goto l433
 						}
-						goto l430
-					l431:
-						position, tokenIndex = position431, tokenIndex431
+						position++
+						{
+							position436, tokenIndex436 := position, tokenIndex
+							if !_rules[ruleWS]() {
+								goto l436
+							}
+							goto l437
+						l436:
+							position, tokenIndex = position436, tokenIndex436
+						}
+					l437:
+						if !_rules[ruleInstructionArg]() {
+							goto l433
+						}
+						goto l432
+					l433:
+						position, tokenIndex = position433, tokenIndex433
 					}
-					goto l429
-				l428:
-					position, tokenIndex = position428, tokenIndex428
+					goto l431
+				l430:
+					position, tokenIndex = position430, tokenIndex430
 				}
-			l429:
-				add(ruleInstruction, position427)
+			l431:
+				add(ruleInstruction, position429)
 			}
 			return true
-		l426:
-			position, tokenIndex = position426, tokenIndex426
+		l428:
+			position, tokenIndex = position428, tokenIndex428
 			return false
 		},
 		/* 31 InstructionName <- <(([a-z] / [A-Z]) ([a-z] / [A-Z] / '.' / ([0-9] / [0-9]))* ('.' / '+' / '-')?)> */
 		func() bool {
-			position436, tokenIndex436 := position, tokenIndex
+			position438, tokenIndex438 := position, tokenIndex
 			{
-				position437 := position
+				position439 := position
 				{
-					position438, tokenIndex438 := position, tokenIndex
+					position440, tokenIndex440 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l439
+						goto l441
 					}
 					position++
-					goto l438
-				l439:
-					position, tokenIndex = position438, tokenIndex438
+					goto l440
+				l441:
+					position, tokenIndex = position440, tokenIndex440
 					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l436
+						goto l438
 					}
 					position++
 				}
-			l438:
 			l440:
+			l442:
 				{
-					position441, tokenIndex441 := position, tokenIndex
+					position443, tokenIndex443 := position, tokenIndex
 					{
-						position442, tokenIndex442 := position, tokenIndex
+						position444, tokenIndex444 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l443
-						}
-						position++
-						goto l442
-					l443:
-						position, tokenIndex = position442, tokenIndex442
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l444
-						}
-						position++
-						goto l442
-					l444:
-						position, tokenIndex = position442, tokenIndex442
-						if buffer[position] != rune('.') {
 							goto l445
 						}
 						position++
-						goto l442
+						goto l444
 					l445:
-						position, tokenIndex = position442, tokenIndex442
+						position, tokenIndex = position444, tokenIndex444
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
+							goto l446
+						}
+						position++
+						goto l444
+					l446:
+						position, tokenIndex = position444, tokenIndex444
+						if buffer[position] != rune('.') {
+							goto l447
+						}
+						position++
+						goto l444
+					l447:
+						position, tokenIndex = position444, tokenIndex444
 						{
-							position446, tokenIndex446 := position, tokenIndex
+							position448, tokenIndex448 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l447
+								goto l449
 							}
 							position++
-							goto l446
-						l447:
-							position, tokenIndex = position446, tokenIndex446
+							goto l448
+						l449:
+							position, tokenIndex = position448, tokenIndex448
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l441
+								goto l443
 							}
 							position++
 						}
-					l446:
+					l448:
 					}
-				l442:
-					goto l440
-				l441:
-					position, tokenIndex = position441, tokenIndex441
+				l444:
+					goto l442
+				l443:
+					position, tokenIndex = position443, tokenIndex443
 				}
 				{
-					position448, tokenIndex448 := position, tokenIndex
+					position450, tokenIndex450 := position, tokenIndex
 					{
-						position450, tokenIndex450 := position, tokenIndex
+						position452, tokenIndex452 := position, tokenIndex
 						if buffer[position] != rune('.') {
-							goto l451
+							goto l453
 						}
 						position++
-						goto l450
-					l451:
-						position, tokenIndex = position450, tokenIndex450
+						goto l452
+					l453:
+						position, tokenIndex = position452, tokenIndex452
 						if buffer[position] != rune('+') {
-							goto l452
+							goto l454
 						}
 						position++
-						goto l450
-					l452:
-						position, tokenIndex = position450, tokenIndex450
+						goto l452
+					l454:
+						position, tokenIndex = position452, tokenIndex452
 						if buffer[position] != rune('-') {
-							goto l448
+							goto l450
 						}
 						position++
 					}
+				l452:
+					goto l451
 				l450:
-					goto l449
-				l448:
-					position, tokenIndex = position448, tokenIndex448
+					position, tokenIndex = position450, tokenIndex450
 				}
-			l449:
-				add(ruleInstructionName, position437)
+			l451:
+				add(ruleInstructionName, position439)
 			}
 			return true
-		l436:
-			position, tokenIndex = position436, tokenIndex436
+		l438:
+			position, tokenIndex = position438, tokenIndex438
 			return false
 		},
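
Per the rule 31 comment, a mnemonic starts with a letter, continues with letters, dots, or digits (the `([0-9] / [0-9])` alternation in the generated comment is redundant but harmless; the code faithfully checks the same range twice), and may end with `'.'`, `'+'`, or `'-'`, e.g. PPC branch hints like `beq+`. As a regexp approximation:

```go
package main

import (
	"fmt"
	"regexp"
)

// Approximation of InstructionName: leading letter, then letters, dots,
// or digits, with an optional '.', '+', or '-' suffix.
var instructionName = regexp.MustCompile(`^[A-Za-z][A-Za-z.0-9]*[.+-]?$`)

func main() {
	for _, s := range []string{"movq", "vfmadd231ps", "b.eq", "beq+", "8mov"} {
		fmt.Printf("%-12s matches: %v\n", s, instructionName.MatchString(s))
	}
}
```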
 		/* 32 InstructionArg <- <(IndirectionIndicator? (ARMConstantTweak / RegisterOrConstant / LocalLabelRef / TOCRefHigh / TOCRefLow / GOTLocation / GOTSymbolOffset / MemoryRef) AVX512Token*)> */
 		func() bool {
-			position453, tokenIndex453 := position, tokenIndex
+			position455, tokenIndex455 := position, tokenIndex
 			{
-				position454 := position
-				{
-					position455, tokenIndex455 := position, tokenIndex
-					if !_rules[ruleIndirectionIndicator]() {
-						goto l455
-					}
-					goto l456
-				l455:
-					position, tokenIndex = position455, tokenIndex455
-				}
-			l456:
+				position456 := position
 				{
 					position457, tokenIndex457 := position, tokenIndex
+					if !_rules[ruleIndirectionIndicator]() {
+						goto l457
+					}
+					goto l458
+				l457:
+					position, tokenIndex = position457, tokenIndex457
+				}
+			l458:
+				{
+					position459, tokenIndex459 := position, tokenIndex
 					if !_rules[ruleARMConstantTweak]() {
-						goto l458
-					}
-					goto l457
-				l458:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleRegisterOrConstant]() {
-						goto l459
-					}
-					goto l457
-				l459:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleLocalLabelRef]() {
 						goto l460
 					}
-					goto l457
+					goto l459
 				l460:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleTOCRefHigh]() {
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleRegisterOrConstant]() {
 						goto l461
 					}
-					goto l457
+					goto l459
 				l461:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleTOCRefLow]() {
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleLocalLabelRef]() {
 						goto l462
 					}
-					goto l457
+					goto l459
 				l462:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleGOTLocation]() {
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleTOCRefHigh]() {
 						goto l463
 					}
-					goto l457
+					goto l459
 				l463:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleGOTSymbolOffset]() {
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleTOCRefLow]() {
 						goto l464
 					}
-					goto l457
+					goto l459
 				l464:
-					position, tokenIndex = position457, tokenIndex457
-					if !_rules[ruleMemoryRef]() {
-						goto l453
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleGOTLocation]() {
+						goto l465
 					}
-				}
-			l457:
-			l465:
-				{
-					position466, tokenIndex466 := position, tokenIndex
-					if !_rules[ruleAVX512Token]() {
+					goto l459
+				l465:
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleGOTSymbolOffset]() {
 						goto l466
 					}
-					goto l465
+					goto l459
 				l466:
-					position, tokenIndex = position466, tokenIndex466
+					position, tokenIndex = position459, tokenIndex459
+					if !_rules[ruleMemoryRef]() {
+						goto l455
+					}
 				}
-				add(ruleInstructionArg, position454)
+			l459:
+			l467:
+				{
+					position468, tokenIndex468 := position, tokenIndex
+					if !_rules[ruleAVX512Token]() {
+						goto l468
+					}
+					goto l467
+				l468:
+					position, tokenIndex = position468, tokenIndex468
+				}
+				add(ruleInstructionArg, position456)
 			}
 			return true
-		l453:
-			position, tokenIndex = position453, tokenIndex453
+		l455:
+			position, tokenIndex = position455, tokenIndex455
 			return false
 		},
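
The `/` in rule 32 is PEG ordered choice: alternatives are attempted left to right and the first success wins, which is why ARMConstantTweak is tried before RegisterOrConstant and MemoryRef is the final fallback. A minimal sketch of that dispatch over hypothetical stand-in matchers:

```go
package main

import (
	"fmt"
	"strings"
)

// Ordered choice as emitted by the PEG generator: alternatives run in
// sequence and the first match wins. The matchers below are hypothetical
// stand-ins for rules like ARMConstantTweak and RegisterOrConstant.
type alt struct {
	rule string
	ok   func(string) bool
}

func classify(arg string, alts []alt) string {
	for _, a := range alts {
		if a.ok(arg) { // later alternatives are never attempted
			return a.rule
		}
	}
	return "no match"
}

func main() {
	alts := []alt{
		{"ARMConstantTweak", func(s string) bool { return strings.HasPrefix(s, "lsl") }},
		{"RegisterOrConstant", func(s string) bool { return strings.HasPrefix(s, "%") }},
	}
	fmt.Println(classify("lsl #12", alts)) // ARMConstantTweak
	fmt.Println(classify("%rax", alts))    // RegisterOrConstant
}
```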
 		/* 33 GOTLocation <- <('$' '_' 'G' 'L' 'O' 'B' 'A' 'L' '_' 'O' 'F' 'F' 'S' 'E' 'T' '_' 'T' 'A' 'B' 'L' 'E' '_' '-' LocalSymbol)> */
 		func() bool {
-			position467, tokenIndex467 := position, tokenIndex
-			{
-				position468 := position
-				if buffer[position] != rune('$') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('_') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('G') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('L') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('O') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('B') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('A') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('L') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('_') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('O') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('F') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('F') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('S') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('E') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('T') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('_') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('T') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('A') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('B') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('L') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('E') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('_') {
-					goto l467
-				}
-				position++
-				if buffer[position] != rune('-') {
-					goto l467
-				}
-				position++
-				if !_rules[ruleLocalSymbol]() {
-					goto l467
-				}
-				add(ruleGOTLocation, position468)
-			}
-			return true
-		l467:
-			position, tokenIndex = position467, tokenIndex467
-			return false
-		},
-		/* 34 GOTSymbolOffset <- <(('$' SymbolName ('@' 'G' 'O' 'T') ('O' 'F' 'F')?) / (':' ('g' / 'G') ('o' / 'O') ('t' / 'T') ':' SymbolName))> */
-		func() bool {
 			position469, tokenIndex469 := position, tokenIndex
 			{
 				position470 := position
-				{
-					position471, tokenIndex471 := position, tokenIndex
-					if buffer[position] != rune('$') {
-						goto l472
-					}
-					position++
-					if !_rules[ruleSymbolName]() {
-						goto l472
-					}
-					if buffer[position] != rune('@') {
-						goto l472
-					}
-					position++
-					if buffer[position] != rune('G') {
-						goto l472
-					}
-					position++
-					if buffer[position] != rune('O') {
-						goto l472
-					}
-					position++
-					if buffer[position] != rune('T') {
-						goto l472
-					}
-					position++
-					{
-						position473, tokenIndex473 := position, tokenIndex
-						if buffer[position] != rune('O') {
-							goto l473
-						}
-						position++
-						if buffer[position] != rune('F') {
-							goto l473
-						}
-						position++
-						if buffer[position] != rune('F') {
-							goto l473
-						}
-						position++
-						goto l474
-					l473:
-						position, tokenIndex = position473, tokenIndex473
-					}
-				l474:
-					goto l471
-				l472:
-					position, tokenIndex = position471, tokenIndex471
-					if buffer[position] != rune(':') {
-						goto l469
-					}
-					position++
-					{
-						position475, tokenIndex475 := position, tokenIndex
-						if buffer[position] != rune('g') {
-							goto l476
-						}
-						position++
-						goto l475
-					l476:
-						position, tokenIndex = position475, tokenIndex475
-						if buffer[position] != rune('G') {
-							goto l469
-						}
-						position++
-					}
-				l475:
-					{
-						position477, tokenIndex477 := position, tokenIndex
-						if buffer[position] != rune('o') {
-							goto l478
-						}
-						position++
-						goto l477
-					l478:
-						position, tokenIndex = position477, tokenIndex477
-						if buffer[position] != rune('O') {
-							goto l469
-						}
-						position++
-					}
-				l477:
-					{
-						position479, tokenIndex479 := position, tokenIndex
-						if buffer[position] != rune('t') {
-							goto l480
-						}
-						position++
-						goto l479
-					l480:
-						position, tokenIndex = position479, tokenIndex479
-						if buffer[position] != rune('T') {
-							goto l469
-						}
-						position++
-					}
-				l479:
-					if buffer[position] != rune(':') {
-						goto l469
-					}
-					position++
-					if !_rules[ruleSymbolName]() {
-						goto l469
-					}
+				if buffer[position] != rune('$') {
+					goto l469
 				}
-			l471:
-				add(ruleGOTSymbolOffset, position470)
+				position++
+				if buffer[position] != rune('_') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('G') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('L') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('O') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('B') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('A') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('L') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('_') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('O') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('F') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('F') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('S') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('E') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('T') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('_') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('T') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('A') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('B') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('L') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('E') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('_') {
+					goto l469
+				}
+				position++
+				if buffer[position] != rune('-') {
+					goto l469
+				}
+				position++
+				if !_rules[ruleLocalSymbol]() {
+					goto l469
+				}
+				add(ruleGOTLocation, position470)
 			}
 			return true
 		l469:
 			position, tokenIndex = position469, tokenIndex469
 			return false
 		},
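
GOTLocation spells the literal `$_GLOBAL_OFFSET_TABLE_-` out rune by rune, which is why a one-line rule expands to roughly a hundred generated lines here. It matches the i386 PIC idiom of subtracting a local symbol from the GOT address. A regexp sketch, assuming a plausible character set for LocalSymbol (the grammar's actual LocalSymbol rule is not shown in this hunk):

```go
package main

import (
	"fmt"
	"regexp"
)

// Sketch of GOTLocation: the fixed prefix followed by a '.L' local
// symbol. The symbol character class is an assumption, not taken from
// the grammar's LocalSymbol rule.
var gotLocation = regexp.MustCompile(`^\$_GLOBAL_OFFSET_TABLE_-\.L[A-Za-z0-9_$.]+$`)

func main() {
	fmt.Println(gotLocation.MatchString("$_GLOBAL_OFFSET_TABLE_-.Lpic")) // true
	fmt.Println(gotLocation.MatchString("$_GLOBAL_OFFSET_TABLE_"))       // false
}
```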
-		/* 35 AVX512Token <- <(WS? '{' '%'? ([0-9] / [a-z])* '}')> */
+		/* 34 GOTSymbolOffset <- <(('$' SymbolName ('@' 'G' 'O' 'T') ('O' 'F' 'F')?) / (':' ('g' / 'G') ('o' / 'O') ('t' / 'T') ':' SymbolName))> */
 		func() bool {
-			position481, tokenIndex481 := position, tokenIndex
+			position471, tokenIndex471 := position, tokenIndex
 			{
-				position482 := position
+				position472 := position
 				{
-					position483, tokenIndex483 := position, tokenIndex
-					if !_rules[ruleWS]() {
-						goto l483
-					}
-					goto l484
-				l483:
-					position, tokenIndex = position483, tokenIndex483
-				}
-			l484:
-				if buffer[position] != rune('{') {
-					goto l481
-				}
-				position++
-				{
-					position485, tokenIndex485 := position, tokenIndex
-					if buffer[position] != rune('%') {
-						goto l485
+					position473, tokenIndex473 := position, tokenIndex
+					if buffer[position] != rune('$') {
+						goto l474
 					}
 					position++
+					if !_rules[ruleSymbolName]() {
+						goto l474
+					}
+					if buffer[position] != rune('@') {
+						goto l474
+					}
+					position++
+					if buffer[position] != rune('G') {
+						goto l474
+					}
+					position++
+					if buffer[position] != rune('O') {
+						goto l474
+					}
+					position++
+					if buffer[position] != rune('T') {
+						goto l474
+					}
+					position++
+					{
+						position475, tokenIndex475 := position, tokenIndex
+						if buffer[position] != rune('O') {
+							goto l475
+						}
+						position++
+						if buffer[position] != rune('F') {
+							goto l475
+						}
+						position++
+						if buffer[position] != rune('F') {
+							goto l475
+						}
+						position++
+						goto l476
+					l475:
+						position, tokenIndex = position475, tokenIndex475
+					}
+				l476:
+					goto l473
+				l474:
+					position, tokenIndex = position473, tokenIndex473
+					if buffer[position] != rune(':') {
+						goto l471
+					}
+					position++
+					{
+						position477, tokenIndex477 := position, tokenIndex
+						if buffer[position] != rune('g') {
+							goto l478
+						}
+						position++
+						goto l477
+					l478:
+						position, tokenIndex = position477, tokenIndex477
+						if buffer[position] != rune('G') {
+							goto l471
+						}
+						position++
+					}
+				l477:
+					{
+						position479, tokenIndex479 := position, tokenIndex
+						if buffer[position] != rune('o') {
+							goto l480
+						}
+						position++
+						goto l479
+					l480:
+						position, tokenIndex = position479, tokenIndex479
+						if buffer[position] != rune('O') {
+							goto l471
+						}
+						position++
+					}
+				l479:
+					{
+						position481, tokenIndex481 := position, tokenIndex
+						if buffer[position] != rune('t') {
+							goto l482
+						}
+						position++
+						goto l481
+					l482:
+						position, tokenIndex = position481, tokenIndex481
+						if buffer[position] != rune('T') {
+							goto l471
+						}
+						position++
+					}
+				l481:
+					if buffer[position] != rune(':') {
+						goto l471
+					}
+					position++
+					if !_rules[ruleSymbolName]() {
+						goto l471
+					}
+				}
+			l473:
+				add(ruleGOTSymbolOffset, position472)
+			}
+			return true
+		l471:
+			position, tokenIndex = position471, tokenIndex471
+			return false
+		},
+		/* 35 AVX512Token <- <(WS? '{' '%'? ([0-9] / [a-z])* '}')> */
+		func() bool {
+			position483, tokenIndex483 := position, tokenIndex
+			{
+				position484 := position
+				{
+					position485, tokenIndex485 := position, tokenIndex
+					if !_rules[ruleWS]() {
+						goto l485
+					}
 					goto l486
 				l485:
 					position, tokenIndex = position485, tokenIndex485
 				}
 			l486:
-			l487:
+				if buffer[position] != rune('{') {
+					goto l483
+				}
+				position++
 				{
-					position488, tokenIndex488 := position, tokenIndex
+					position487, tokenIndex487 := position, tokenIndex
+					if buffer[position] != rune('%') {
+						goto l487
+					}
+					position++
+					goto l488
+				l487:
+					position, tokenIndex = position487, tokenIndex487
+				}
+			l488:
+			l489:
+				{
+					position490, tokenIndex490 := position, tokenIndex
 					{
-						position489, tokenIndex489 := position, tokenIndex
+						position491, tokenIndex491 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l492
+						}
+						position++
+						goto l491
+					l492:
+						position, tokenIndex = position491, tokenIndex491
+						if c := buffer[position]; c < rune('a') || c > rune('z') {
 							goto l490
 						}
 						position++
-						goto l489
-					l490:
-						position, tokenIndex = position489, tokenIndex489
-						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l488
-						}
-						position++
 					}
-				l489:
-					goto l487
-				l488:
-					position, tokenIndex = position488, tokenIndex488
+				l491:
+					goto l489
+				l490:
+					position, tokenIndex = position490, tokenIndex490
 				}
 				if buffer[position] != rune('}') {
-					goto l481
+					goto l483
 				}
 				position++
-				add(ruleAVX512Token, position482)
+				add(ruleAVX512Token, position484)
 			}
 			return true
-		l481:
-			position, tokenIndex = position481, tokenIndex481
+		l483:
+			position, tokenIndex = position483, tokenIndex483
 			return false
 		},
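
AVX512Token covers the opmask and zeroing annotations that can trail an operand, as in `%zmm0{%k1}{z}`; the leading whitespace, the `%`, and the token body are each optional. A regexp sketch of a single token:

```go
package main

import (
	"fmt"
	"regexp"
)

// One AVX512Token: optional whitespace, '{', optional '%', then digits
// or lowercase letters, then '}'. Illustrative approximation.
var avx512Token = regexp.MustCompile(`^[ \t]*\{%?[0-9a-z]*\}$`)

func main() {
	for _, s := range []string{"{%k1}", "{z}", " {%k2}", "{K1}"} {
		fmt.Printf("%-6s matches: %v\n", s, avx512Token.MatchString(s))
	}
}
```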
 		/* 36 TOCRefHigh <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('h' / 'H') ('a' / 'A')))> */
 		func() bool {
-			position491, tokenIndex491 := position, tokenIndex
+			position493, tokenIndex493 := position, tokenIndex
 			{
-				position492 := position
+				position494 := position
 				if buffer[position] != rune('.') {
-					goto l491
+					goto l493
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l491
+					goto l493
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l491
+					goto l493
 				}
 				position++
 				if buffer[position] != rune('C') {
-					goto l491
+					goto l493
 				}
 				position++
 				if buffer[position] != rune('.') {
-					goto l491
+					goto l493
 				}
 				position++
 				if buffer[position] != rune('-') {
-					goto l491
+					goto l493
 				}
 				position++
 				{
-					position493, tokenIndex493 := position, tokenIndex
+					position495, tokenIndex495 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l494
+						goto l496
 					}
 					position++
 					if buffer[position] != rune('b') {
-						goto l494
+						goto l496
 					}
 					position++
-					goto l493
-				l494:
-					position, tokenIndex = position493, tokenIndex493
+					goto l495
+				l496:
+					position, tokenIndex = position495, tokenIndex495
 					if buffer[position] != rune('.') {
-						goto l491
+						goto l493
 					}
 					position++
 					if buffer[position] != rune('L') {
-						goto l491
+						goto l493
 					}
 					position++
 					{
-						position497, tokenIndex497 := position, tokenIndex
+						position499, tokenIndex499 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l498
-						}
-						position++
-						goto l497
-					l498:
-						position, tokenIndex = position497, tokenIndex497
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l499
-						}
-						position++
-						goto l497
-					l499:
-						position, tokenIndex = position497, tokenIndex497
-						if buffer[position] != rune('_') {
 							goto l500
 						}
 						position++
-						goto l497
+						goto l499
 					l500:
-						position, tokenIndex = position497, tokenIndex497
+						position, tokenIndex = position499, tokenIndex499
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
+							goto l501
+						}
+						position++
+						goto l499
+					l501:
+						position, tokenIndex = position499, tokenIndex499
+						if buffer[position] != rune('_') {
+							goto l502
+						}
+						position++
+						goto l499
+					l502:
+						position, tokenIndex = position499, tokenIndex499
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l491
+							goto l493
 						}
 						position++
 					}
+				l499:
 				l497:
-				l495:
 					{
-						position496, tokenIndex496 := position, tokenIndex
+						position498, tokenIndex498 := position, tokenIndex
 						{
-							position501, tokenIndex501 := position, tokenIndex
+							position503, tokenIndex503 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l502
-							}
-							position++
-							goto l501
-						l502:
-							position, tokenIndex = position501, tokenIndex501
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l503
-							}
-							position++
-							goto l501
-						l503:
-							position, tokenIndex = position501, tokenIndex501
-							if buffer[position] != rune('_') {
 								goto l504
 							}
 							position++
-							goto l501
+							goto l503
 						l504:
-							position, tokenIndex = position501, tokenIndex501
+							position, tokenIndex = position503, tokenIndex503
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
+								goto l505
+							}
+							position++
+							goto l503
+						l505:
+							position, tokenIndex = position503, tokenIndex503
+							if buffer[position] != rune('_') {
+								goto l506
+							}
+							position++
+							goto l503
+						l506:
+							position, tokenIndex = position503, tokenIndex503
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l496
+								goto l498
 							}
 							position++
 						}
-					l501:
-						goto l495
-					l496:
-						position, tokenIndex = position496, tokenIndex496
+					l503:
+						goto l497
+					l498:
+						position, tokenIndex = position498, tokenIndex498
 					}
 				}
-			l493:
+			l495:
 				if buffer[position] != rune('@') {
-					goto l491
+					goto l493
 				}
 				position++
 				{
-					position505, tokenIndex505 := position, tokenIndex
-					if buffer[position] != rune('h') {
-						goto l506
-					}
-					position++
-					goto l505
-				l506:
-					position, tokenIndex = position505, tokenIndex505
-					if buffer[position] != rune('H') {
-						goto l491
-					}
-					position++
-				}
-			l505:
-				{
 					position507, tokenIndex507 := position, tokenIndex
-					if buffer[position] != rune('a') {
+					if buffer[position] != rune('h') {
 						goto l508
 					}
 					position++
 					goto l507
 				l508:
 					position, tokenIndex = position507, tokenIndex507
-					if buffer[position] != rune('A') {
-						goto l491
+					if buffer[position] != rune('H') {
+						goto l493
 					}
 					position++
 				}
 			l507:
-				add(ruleTOCRefHigh, position492)
+				{
+					position509, tokenIndex509 := position, tokenIndex
+					if buffer[position] != rune('a') {
+						goto l510
+					}
+					position++
+					goto l509
+				l510:
+					position, tokenIndex = position509, tokenIndex509
+					if buffer[position] != rune('A') {
+						goto l493
+					}
+					position++
+				}
+			l509:
+				add(ruleTOCRefHigh, position494)
 			}
 			return true
-		l491:
-			position, tokenIndex = position491, tokenIndex491
+		l493:
+			position, tokenIndex = position493, tokenIndex493
 			return false
 		},
 		/* 37 TOCRefLow <- <('.' 'T' 'O' 'C' '.' '-' (('0' 'b') / ('.' 'L' ([a-z] / [A-Z] / '_' / [0-9])+)) ('@' ('l' / 'L')))> */
 		func() bool {
-			position509, tokenIndex509 := position, tokenIndex
+			position511, tokenIndex511 := position, tokenIndex
 			{
-				position510 := position
+				position512 := position
 				if buffer[position] != rune('.') {
-					goto l509
+					goto l511
 				}
 				position++
 				if buffer[position] != rune('T') {
-					goto l509
+					goto l511
 				}
 				position++
 				if buffer[position] != rune('O') {
-					goto l509
+					goto l511
 				}
 				position++
 				if buffer[position] != rune('C') {
-					goto l509
+					goto l511
 				}
 				position++
 				if buffer[position] != rune('.') {
-					goto l509
+					goto l511
 				}
 				position++
 				if buffer[position] != rune('-') {
-					goto l509
+					goto l511
 				}
 				position++
 				{
-					position511, tokenIndex511 := position, tokenIndex
+					position513, tokenIndex513 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l512
+						goto l514
 					}
 					position++
 					if buffer[position] != rune('b') {
-						goto l512
+						goto l514
 					}
 					position++
-					goto l511
-				l512:
-					position, tokenIndex = position511, tokenIndex511
+					goto l513
+				l514:
+					position, tokenIndex = position513, tokenIndex513
 					if buffer[position] != rune('.') {
-						goto l509
+						goto l511
 					}
 					position++
 					if buffer[position] != rune('L') {
-						goto l509
+						goto l511
 					}
 					position++
 					{
-						position515, tokenIndex515 := position, tokenIndex
+						position517, tokenIndex517 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l516
-						}
-						position++
-						goto l515
-					l516:
-						position, tokenIndex = position515, tokenIndex515
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l517
-						}
-						position++
-						goto l515
-					l517:
-						position, tokenIndex = position515, tokenIndex515
-						if buffer[position] != rune('_') {
 							goto l518
 						}
 						position++
-						goto l515
+						goto l517
 					l518:
-						position, tokenIndex = position515, tokenIndex515
+						position, tokenIndex = position517, tokenIndex517
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
+							goto l519
+						}
+						position++
+						goto l517
+					l519:
+						position, tokenIndex = position517, tokenIndex517
+						if buffer[position] != rune('_') {
+							goto l520
+						}
+						position++
+						goto l517
+					l520:
+						position, tokenIndex = position517, tokenIndex517
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l509
+							goto l511
 						}
 						position++
 					}
+				l517:
 				l515:
-				l513:
 					{
-						position514, tokenIndex514 := position, tokenIndex
+						position516, tokenIndex516 := position, tokenIndex
 						{
-							position519, tokenIndex519 := position, tokenIndex
+							position521, tokenIndex521 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l520
-							}
-							position++
-							goto l519
-						l520:
-							position, tokenIndex = position519, tokenIndex519
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l521
-							}
-							position++
-							goto l519
-						l521:
-							position, tokenIndex = position519, tokenIndex519
-							if buffer[position] != rune('_') {
 								goto l522
 							}
 							position++
-							goto l519
+							goto l521
 						l522:
-							position, tokenIndex = position519, tokenIndex519
+							position, tokenIndex = position521, tokenIndex521
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
+								goto l523
+							}
+							position++
+							goto l521
+						l523:
+							position, tokenIndex = position521, tokenIndex521
+							if buffer[position] != rune('_') {
+								goto l524
+							}
+							position++
+							goto l521
+						l524:
+							position, tokenIndex = position521, tokenIndex521
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l514
+								goto l516
 							}
 							position++
 						}
-					l519:
-						goto l513
-					l514:
-						position, tokenIndex = position514, tokenIndex514
+					l521:
+						goto l515
+					l516:
+						position, tokenIndex = position516, tokenIndex516
 					}
 				}
-			l511:
+			l513:
 				if buffer[position] != rune('@') {
-					goto l509
+					goto l511
 				}
 				position++
 				{
-					position523, tokenIndex523 := position, tokenIndex
+					position525, tokenIndex525 := position, tokenIndex
 					if buffer[position] != rune('l') {
-						goto l524
+						goto l526
 					}
 					position++
-					goto l523
-				l524:
-					position, tokenIndex = position523, tokenIndex523
+					goto l525
+				l526:
+					position, tokenIndex = position525, tokenIndex525
 					if buffer[position] != rune('L') {
-						goto l509
+						goto l511
 					}
 					position++
 				}
-			l523:
-				add(ruleTOCRefLow, position510)
+			l525:
+				add(ruleTOCRefLow, position512)
 			}
 			return true
-		l509:
-			position, tokenIndex = position509, tokenIndex509
+		l511:
+			position, tokenIndex = position511, tokenIndex511
 			return false
 		},
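
Rules 36 and 37 differ only in the relocation suffix: TOCRefHigh takes `@ha` (high-adjusted) and TOCRefLow takes `@l`, both case-insensitive, after `.TOC.-` minus either `0b` or a `.L` label. These are the PPC64 TOC-pointer setup operands, as in `addis 2, 12, .TOC.-0b@ha`. One regexp sketch covering both:

```go
package main

import (
	"fmt"
	"regexp"
)

// Sketch covering TOCRefHigh and TOCRefLow: '.TOC.-' then '0b' or a .L
// label, then '@ha' or '@l' (case-insensitive). Illustrative only.
var tocRef = regexp.MustCompile(`^\.TOC\.-(0b|\.L[A-Za-z_0-9]+)@(?i:ha|l)$`)

func main() {
	for _, s := range []string{".TOC.-0b@ha", ".TOC.-.Lfunc@l", ".TOC.-0b@hi"} {
		fmt.Printf("%-15s matches: %v\n", s, tocRef.MatchString(s))
	}
}
```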
 		/* 38 IndirectionIndicator <- <'*'> */
 		func() bool {
-			position525, tokenIndex525 := position, tokenIndex
+			position527, tokenIndex527 := position, tokenIndex
 			{
-				position526 := position
+				position528 := position
 				if buffer[position] != rune('*') {
-					goto l525
+					goto l527
 				}
 				position++
-				add(ruleIndirectionIndicator, position526)
+				add(ruleIndirectionIndicator, position528)
 			}
 			return true
-		l525:
-			position, tokenIndex = position525, tokenIndex525
+		l527:
+			position, tokenIndex = position527, tokenIndex527
 			return false
 		},
 		/* 39 RegisterOrConstant <- <((('%' ([a-z] / [A-Z]) ([a-z] / [A-Z] / ([0-9] / [0-9]))*) / ('$'? ((Offset Offset) / Offset)) / ('#' Offset ('*' [0-9]+ ('-' [0-9] [0-9]*)?)?) / ('#' '~'? '(' [0-9] WS? ('<' '<') WS? [0-9] ')') / ARMRegister) !('f' / 'b' / ':' / '(' / '+' / '-'))> */
 		func() bool {
-			position527, tokenIndex527 := position, tokenIndex
+			position529, tokenIndex529 := position, tokenIndex
 			{
-				position528 := position
+				position530 := position
 				{
-					position529, tokenIndex529 := position, tokenIndex
+					position531, tokenIndex531 := position, tokenIndex
 					if buffer[position] != rune('%') {
-						goto l530
+						goto l532
 					}
 					position++
 					{
-						position531, tokenIndex531 := position, tokenIndex
+						position533, tokenIndex533 := position, tokenIndex
 						if c := buffer[position]; c < rune('a') || c > rune('z') {
+							goto l534
+						}
+						position++
+						goto l533
+					l534:
+						position, tokenIndex = position533, tokenIndex533
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
 							goto l532
 						}
 						position++
-						goto l531
-					l532:
-						position, tokenIndex = position531, tokenIndex531
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l530
-						}
-						position++
 					}
-				l531:
 				l533:
+				l535:
 					{
-						position534, tokenIndex534 := position, tokenIndex
+						position536, tokenIndex536 := position, tokenIndex
 						{
-							position535, tokenIndex535 := position, tokenIndex
+							position537, tokenIndex537 := position, tokenIndex
 							if c := buffer[position]; c < rune('a') || c > rune('z') {
-								goto l536
-							}
-							position++
-							goto l535
-						l536:
-							position, tokenIndex = position535, tokenIndex535
-							if c := buffer[position]; c < rune('A') || c > rune('Z') {
-								goto l537
-							}
-							position++
-							goto l535
-						l537:
-							position, tokenIndex = position535, tokenIndex535
-							{
-								position538, tokenIndex538 := position, tokenIndex
-								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l539
-								}
-								position++
 								goto l538
-							l539:
-								position, tokenIndex = position538, tokenIndex538
+							}
+							position++
+							goto l537
+						l538:
+							position, tokenIndex = position537, tokenIndex537
+							if c := buffer[position]; c < rune('A') || c > rune('Z') {
+								goto l539
+							}
+							position++
+							goto l537
+						l539:
+							position, tokenIndex = position537, tokenIndex537
+							{
+								position540, tokenIndex540 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l534
+									goto l541
+								}
+								position++
+								goto l540
+							l541:
+								position, tokenIndex = position540, tokenIndex540
+								if c := buffer[position]; c < rune('0') || c > rune('9') {
+									goto l536
 								}
 								position++
 							}
-						l538:
+						l540:
 						}
-					l535:
-						goto l533
-					l534:
-						position, tokenIndex = position534, tokenIndex534
+					l537:
+						goto l535
+					l536:
+						position, tokenIndex = position536, tokenIndex536
 					}
-					goto l529
-				l530:
-					position, tokenIndex = position529, tokenIndex529
-					{
-						position541, tokenIndex541 := position, tokenIndex
-						if buffer[position] != rune('$') {
-							goto l541
-						}
-						position++
-						goto l542
-					l541:
-						position, tokenIndex = position541, tokenIndex541
-					}
-				l542:
+					goto l531
+				l532:
+					position, tokenIndex = position531, tokenIndex531
 					{
 						position543, tokenIndex543 := position, tokenIndex
-						if !_rules[ruleOffset]() {
-							goto l544
+						if buffer[position] != rune('$') {
+							goto l543
 						}
-						if !_rules[ruleOffset]() {
-							goto l544
-						}
-						goto l543
-					l544:
+						position++
+						goto l544
+					l543:
 						position, tokenIndex = position543, tokenIndex543
+					}
+				l544:
+					{
+						position545, tokenIndex545 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l540
+							goto l546
+						}
+						if !_rules[ruleOffset]() {
+							goto l546
+						}
+						goto l545
+					l546:
+						position, tokenIndex = position545, tokenIndex545
+						if !_rules[ruleOffset]() {
+							goto l542
 						}
 					}
-				l543:
-					goto l529
-				l540:
-					position, tokenIndex = position529, tokenIndex529
+				l545:
+					goto l531
+				l542:
+					position, tokenIndex = position531, tokenIndex531
 					if buffer[position] != rune('#') {
-						goto l545
+						goto l547
 					}
 					position++
 					if !_rules[ruleOffset]() {
-						goto l545
+						goto l547
 					}
 					{
-						position546, tokenIndex546 := position, tokenIndex
+						position548, tokenIndex548 := position, tokenIndex
 						if buffer[position] != rune('*') {
-							goto l546
+							goto l548
 						}
 						position++
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l546
+							goto l548
 						}
 						position++
-					l548:
+					l550:
 						{
-							position549, tokenIndex549 := position, tokenIndex
+							position551, tokenIndex551 := position, tokenIndex
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l549
+								goto l551
 							}
 							position++
-							goto l548
-						l549:
-							position, tokenIndex = position549, tokenIndex549
+							goto l550
+						l551:
+							position, tokenIndex = position551, tokenIndex551
 						}
 						{
-							position550, tokenIndex550 := position, tokenIndex
+							position552, tokenIndex552 := position, tokenIndex
 							if buffer[position] != rune('-') {
-								goto l550
+								goto l552
 							}
 							position++
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l550
+								goto l552
 							}
 							position++
-						l552:
+						l554:
 							{
-								position553, tokenIndex553 := position, tokenIndex
+								position555, tokenIndex555 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l553
+									goto l555
 								}
 								position++
-								goto l552
-							l553:
-								position, tokenIndex = position553, tokenIndex553
+								goto l554
+							l555:
+								position, tokenIndex = position555, tokenIndex555
 							}
-							goto l551
-						l550:
-							position, tokenIndex = position550, tokenIndex550
+							goto l553
+						l552:
+							position, tokenIndex = position552, tokenIndex552
 						}
-					l551:
-						goto l547
-					l546:
-						position, tokenIndex = position546, tokenIndex546
+					l553:
+						goto l549
+					l548:
+						position, tokenIndex = position548, tokenIndex548
 					}
+				l549:
+					goto l531
 				l547:
-					goto l529
-				l545:
-					position, tokenIndex = position529, tokenIndex529
+					position, tokenIndex = position531, tokenIndex531
 					if buffer[position] != rune('#') {
-						goto l554
-					}
-					position++
-					{
-						position555, tokenIndex555 := position, tokenIndex
-						if buffer[position] != rune('~') {
-							goto l555
-						}
-						position++
 						goto l556
-					l555:
-						position, tokenIndex = position555, tokenIndex555
-					}
-				l556:
-					if buffer[position] != rune('(') {
-						goto l554
-					}
-					position++
-					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l554
 					}
 					position++
 					{
 						position557, tokenIndex557 := position, tokenIndex
-						if !_rules[ruleWS]() {
+						if buffer[position] != rune('~') {
 							goto l557
 						}
+						position++
 						goto l558
 					l557:
 						position, tokenIndex = position557, tokenIndex557
 					}
 				l558:
-					if buffer[position] != rune('<') {
-						goto l554
+					if buffer[position] != rune('(') {
+						goto l556
 					}
 					position++
-					if buffer[position] != rune('<') {
-						goto l554
+					if c := buffer[position]; c < rune('0') || c > rune('9') {
+						goto l556
 					}
 					position++
 					{
@@ -4482,1506 +4471,1514 @@
 						position, tokenIndex = position559, tokenIndex559
 					}
 				l560:
+					if buffer[position] != rune('<') {
+						goto l556
+					}
+					position++
+					if buffer[position] != rune('<') {
+						goto l556
+					}
+					position++
+					{
+						position561, tokenIndex561 := position, tokenIndex
+						if !_rules[ruleWS]() {
+							goto l561
+						}
+						goto l562
+					l561:
+						position, tokenIndex = position561, tokenIndex561
+					}
+				l562:
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l554
+						goto l556
 					}
 					position++
 					if buffer[position] != rune(')') {
-						goto l554
+						goto l556
 					}
 					position++
-					goto l529
-				l554:
-					position, tokenIndex = position529, tokenIndex529
+					goto l531
+				l556:
+					position, tokenIndex = position531, tokenIndex531
 					if !_rules[ruleARMRegister]() {
-						goto l527
+						goto l529
 					}
 				}
-			l529:
+			l531:
 				{
-					position561, tokenIndex561 := position, tokenIndex
+					position563, tokenIndex563 := position, tokenIndex
 					{
-						position562, tokenIndex562 := position, tokenIndex
+						position564, tokenIndex564 := position, tokenIndex
 						if buffer[position] != rune('f') {
-							goto l563
-						}
-						position++
-						goto l562
-					l563:
-						position, tokenIndex = position562, tokenIndex562
-						if buffer[position] != rune('b') {
-							goto l564
-						}
-						position++
-						goto l562
-					l564:
-						position, tokenIndex = position562, tokenIndex562
-						if buffer[position] != rune(':') {
 							goto l565
 						}
 						position++
-						goto l562
+						goto l564
 					l565:
-						position, tokenIndex = position562, tokenIndex562
-						if buffer[position] != rune('(') {
+						position, tokenIndex = position564, tokenIndex564
+						if buffer[position] != rune('b') {
 							goto l566
 						}
 						position++
-						goto l562
+						goto l564
 					l566:
-						position, tokenIndex = position562, tokenIndex562
-						if buffer[position] != rune('+') {
+						position, tokenIndex = position564, tokenIndex564
+						if buffer[position] != rune(':') {
 							goto l567
 						}
 						position++
-						goto l562
+						goto l564
 					l567:
-						position, tokenIndex = position562, tokenIndex562
+						position, tokenIndex = position564, tokenIndex564
+						if buffer[position] != rune('(') {
+							goto l568
+						}
+						position++
+						goto l564
+					l568:
+						position, tokenIndex = position564, tokenIndex564
+						if buffer[position] != rune('+') {
+							goto l569
+						}
+						position++
+						goto l564
+					l569:
+						position, tokenIndex = position564, tokenIndex564
 						if buffer[position] != rune('-') {
-							goto l561
+							goto l563
 						}
 						position++
 					}
-				l562:
-					goto l527
-				l561:
-					position, tokenIndex = position561, tokenIndex561
+				l564:
+					goto l529
+				l563:
+					position, tokenIndex = position563, tokenIndex563
 				}
-				add(ruleRegisterOrConstant, position528)
+				add(ruleRegisterOrConstant, position530)
 			}
 			return true
-		l527:
-			position, tokenIndex = position527, tokenIndex527
+		l529:
+			position, tokenIndex = position529, tokenIndex529
 			return false
 		},
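
The trailing `!('f' / 'b' / ':' / '(' / '+' / '-')` in rule 39 is a negative lookahead: the match is rejected if the next character is one of those runes. That is what keeps a bare Offset from swallowing the `8` in `8(%rsp)` (a MemoryRef) or the `1` in `1b` (a LocalLabelRef); on rejection the parser backtracks and a later alternative in InstructionArg claims the text. A sketch of the check, using a hypothetical helper name:

```go
package main

import "fmt"

// rejectedByLookahead models !('f' / 'b' / ':' / '(' / '+' / '-'): if the
// character after the candidate match is one of these, the whole
// RegisterOrConstant match fails and parsing backtracks. Hypothetical
// helper for illustration.
func rejectedByLookahead(next byte) bool {
	switch next {
	case 'f', 'b', ':', '(', '+', '-':
		return true
	}
	return false
}

func main() {
	fmt.Println(rejectedByLookahead('(')) // true: "8(%rsp)" falls through to MemoryRef
	fmt.Println(rejectedByLookahead('b')) // true: "1b" falls through to LocalLabelRef
	fmt.Println(rejectedByLookahead(',')) // false: in "$8, %rax" the "$8" stands
}
```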
 		/* 40 ARMConstantTweak <- <(((('l' / 'L') ('s' / 'S') ('l' / 'L')) / (('s' / 'S') ('x' / 'X') ('t' / 'T') ('w' / 'W')) / (('s' / 'S') ('x' / 'X') ('t' / 'T') ('b' / 'B')) / (('u' / 'U') ('x' / 'X') ('t' / 'T') ('w' / 'W')) / (('u' / 'U') ('x' / 'X') ('t' / 'T') ('b' / 'B')) / (('l' / 'L') ('s' / 'S') ('r' / 'R')) / (('r' / 'R') ('o' / 'O') ('r' / 'R')) / (('a' / 'A') ('s' / 'S') ('r' / 'R'))) (WS '#' Offset)?)> */
 		func() bool {
-			position568, tokenIndex568 := position, tokenIndex
+			position570, tokenIndex570 := position, tokenIndex
 			{
-				position569 := position
+				position571 := position
 				{
-					position570, tokenIndex570 := position, tokenIndex
-					{
-						position572, tokenIndex572 := position, tokenIndex
-						if buffer[position] != rune('l') {
-							goto l573
-						}
-						position++
-						goto l572
-					l573:
-						position, tokenIndex = position572, tokenIndex572
-						if buffer[position] != rune('L') {
-							goto l571
-						}
-						position++
-					}
-				l572:
+					position572, tokenIndex572 := position, tokenIndex
 					{
 						position574, tokenIndex574 := position, tokenIndex
-						if buffer[position] != rune('s') {
+						if buffer[position] != rune('l') {
 							goto l575
 						}
 						position++
 						goto l574
 					l575:
 						position, tokenIndex = position574, tokenIndex574
-						if buffer[position] != rune('S') {
-							goto l571
+						if buffer[position] != rune('L') {
+							goto l573
 						}
 						position++
 					}
 				l574:
 					{
 						position576, tokenIndex576 := position, tokenIndex
-						if buffer[position] != rune('l') {
+						if buffer[position] != rune('s') {
 							goto l577
 						}
 						position++
 						goto l576
 					l577:
 						position, tokenIndex = position576, tokenIndex576
-						if buffer[position] != rune('L') {
-							goto l571
+						if buffer[position] != rune('S') {
+							goto l573
 						}
 						position++
 					}
 				l576:
-					goto l570
-				l571:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position579, tokenIndex579 := position, tokenIndex
-						if buffer[position] != rune('s') {
-							goto l580
+						position578, tokenIndex578 := position, tokenIndex
+						if buffer[position] != rune('l') {
+							goto l579
 						}
 						position++
-						goto l579
-					l580:
-						position, tokenIndex = position579, tokenIndex579
-						if buffer[position] != rune('S') {
-							goto l578
+						goto l578
+					l579:
+						position, tokenIndex = position578, tokenIndex578
+						if buffer[position] != rune('L') {
+							goto l573
 						}
 						position++
 					}
-				l579:
+				l578:
+					goto l572
+				l573:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position581, tokenIndex581 := position, tokenIndex
-						if buffer[position] != rune('x') {
+						if buffer[position] != rune('s') {
 							goto l582
 						}
 						position++
 						goto l581
 					l582:
 						position, tokenIndex = position581, tokenIndex581
-						if buffer[position] != rune('X') {
-							goto l578
+						if buffer[position] != rune('S') {
+							goto l580
 						}
 						position++
 					}
 				l581:
 					{
 						position583, tokenIndex583 := position, tokenIndex
-						if buffer[position] != rune('t') {
+						if buffer[position] != rune('x') {
 							goto l584
 						}
 						position++
 						goto l583
 					l584:
 						position, tokenIndex = position583, tokenIndex583
-						if buffer[position] != rune('T') {
-							goto l578
+						if buffer[position] != rune('X') {
+							goto l580
 						}
 						position++
 					}
 				l583:
 					{
 						position585, tokenIndex585 := position, tokenIndex
-						if buffer[position] != rune('w') {
+						if buffer[position] != rune('t') {
 							goto l586
 						}
 						position++
 						goto l585
 					l586:
 						position, tokenIndex = position585, tokenIndex585
-						if buffer[position] != rune('W') {
-							goto l578
+						if buffer[position] != rune('T') {
+							goto l580
 						}
 						position++
 					}
 				l585:
-					goto l570
-				l578:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position588, tokenIndex588 := position, tokenIndex
-						if buffer[position] != rune('s') {
-							goto l589
+						position587, tokenIndex587 := position, tokenIndex
+						if buffer[position] != rune('w') {
+							goto l588
 						}
 						position++
-						goto l588
-					l589:
-						position, tokenIndex = position588, tokenIndex588
-						if buffer[position] != rune('S') {
-							goto l587
+						goto l587
+					l588:
+						position, tokenIndex = position587, tokenIndex587
+						if buffer[position] != rune('W') {
+							goto l580
 						}
 						position++
 					}
-				l588:
+				l587:
+					goto l572
+				l580:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position590, tokenIndex590 := position, tokenIndex
-						if buffer[position] != rune('x') {
+						if buffer[position] != rune('s') {
 							goto l591
 						}
 						position++
 						goto l590
 					l591:
 						position, tokenIndex = position590, tokenIndex590
-						if buffer[position] != rune('X') {
-							goto l587
+						if buffer[position] != rune('S') {
+							goto l589
 						}
 						position++
 					}
 				l590:
 					{
 						position592, tokenIndex592 := position, tokenIndex
-						if buffer[position] != rune('t') {
+						if buffer[position] != rune('x') {
 							goto l593
 						}
 						position++
 						goto l592
 					l593:
 						position, tokenIndex = position592, tokenIndex592
-						if buffer[position] != rune('T') {
-							goto l587
+						if buffer[position] != rune('X') {
+							goto l589
 						}
 						position++
 					}
 				l592:
 					{
 						position594, tokenIndex594 := position, tokenIndex
-						if buffer[position] != rune('b') {
+						if buffer[position] != rune('t') {
 							goto l595
 						}
 						position++
 						goto l594
 					l595:
 						position, tokenIndex = position594, tokenIndex594
-						if buffer[position] != rune('B') {
-							goto l587
+						if buffer[position] != rune('T') {
+							goto l589
 						}
 						position++
 					}
 				l594:
-					goto l570
-				l587:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position597, tokenIndex597 := position, tokenIndex
-						if buffer[position] != rune('u') {
-							goto l598
+						position596, tokenIndex596 := position, tokenIndex
+						if buffer[position] != rune('b') {
+							goto l597
 						}
 						position++
-						goto l597
-					l598:
-						position, tokenIndex = position597, tokenIndex597
-						if buffer[position] != rune('U') {
-							goto l596
+						goto l596
+					l597:
+						position, tokenIndex = position596, tokenIndex596
+						if buffer[position] != rune('B') {
+							goto l589
 						}
 						position++
 					}
-				l597:
+				l596:
+					goto l572
+				l589:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position599, tokenIndex599 := position, tokenIndex
-						if buffer[position] != rune('x') {
+						if buffer[position] != rune('u') {
 							goto l600
 						}
 						position++
 						goto l599
 					l600:
 						position, tokenIndex = position599, tokenIndex599
-						if buffer[position] != rune('X') {
-							goto l596
+						if buffer[position] != rune('U') {
+							goto l598
 						}
 						position++
 					}
 				l599:
 					{
 						position601, tokenIndex601 := position, tokenIndex
-						if buffer[position] != rune('t') {
+						if buffer[position] != rune('x') {
 							goto l602
 						}
 						position++
 						goto l601
 					l602:
 						position, tokenIndex = position601, tokenIndex601
-						if buffer[position] != rune('T') {
-							goto l596
+						if buffer[position] != rune('X') {
+							goto l598
 						}
 						position++
 					}
 				l601:
 					{
 						position603, tokenIndex603 := position, tokenIndex
-						if buffer[position] != rune('w') {
+						if buffer[position] != rune('t') {
 							goto l604
 						}
 						position++
 						goto l603
 					l604:
 						position, tokenIndex = position603, tokenIndex603
-						if buffer[position] != rune('W') {
-							goto l596
+						if buffer[position] != rune('T') {
+							goto l598
 						}
 						position++
 					}
 				l603:
-					goto l570
-				l596:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position606, tokenIndex606 := position, tokenIndex
-						if buffer[position] != rune('u') {
-							goto l607
+						position605, tokenIndex605 := position, tokenIndex
+						if buffer[position] != rune('w') {
+							goto l606
 						}
 						position++
-						goto l606
-					l607:
-						position, tokenIndex = position606, tokenIndex606
-						if buffer[position] != rune('U') {
-							goto l605
+						goto l605
+					l606:
+						position, tokenIndex = position605, tokenIndex605
+						if buffer[position] != rune('W') {
+							goto l598
 						}
 						position++
 					}
-				l606:
+				l605:
+					goto l572
+				l598:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position608, tokenIndex608 := position, tokenIndex
-						if buffer[position] != rune('x') {
+						if buffer[position] != rune('u') {
 							goto l609
 						}
 						position++
 						goto l608
 					l609:
 						position, tokenIndex = position608, tokenIndex608
-						if buffer[position] != rune('X') {
-							goto l605
+						if buffer[position] != rune('U') {
+							goto l607
 						}
 						position++
 					}
 				l608:
 					{
 						position610, tokenIndex610 := position, tokenIndex
-						if buffer[position] != rune('t') {
+						if buffer[position] != rune('x') {
 							goto l611
 						}
 						position++
 						goto l610
 					l611:
 						position, tokenIndex = position610, tokenIndex610
-						if buffer[position] != rune('T') {
-							goto l605
+						if buffer[position] != rune('X') {
+							goto l607
 						}
 						position++
 					}
 				l610:
 					{
 						position612, tokenIndex612 := position, tokenIndex
-						if buffer[position] != rune('b') {
+						if buffer[position] != rune('t') {
 							goto l613
 						}
 						position++
 						goto l612
 					l613:
 						position, tokenIndex = position612, tokenIndex612
-						if buffer[position] != rune('B') {
-							goto l605
+						if buffer[position] != rune('T') {
+							goto l607
 						}
 						position++
 					}
 				l612:
-					goto l570
-				l605:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position615, tokenIndex615 := position, tokenIndex
-						if buffer[position] != rune('l') {
-							goto l616
+						position614, tokenIndex614 := position, tokenIndex
+						if buffer[position] != rune('b') {
+							goto l615
 						}
 						position++
-						goto l615
-					l616:
-						position, tokenIndex = position615, tokenIndex615
-						if buffer[position] != rune('L') {
-							goto l614
+						goto l614
+					l615:
+						position, tokenIndex = position614, tokenIndex614
+						if buffer[position] != rune('B') {
+							goto l607
 						}
 						position++
 					}
-				l615:
+				l614:
+					goto l572
+				l607:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position617, tokenIndex617 := position, tokenIndex
-						if buffer[position] != rune('s') {
+						if buffer[position] != rune('l') {
 							goto l618
 						}
 						position++
 						goto l617
 					l618:
 						position, tokenIndex = position617, tokenIndex617
-						if buffer[position] != rune('S') {
-							goto l614
+						if buffer[position] != rune('L') {
+							goto l616
 						}
 						position++
 					}
 				l617:
 					{
 						position619, tokenIndex619 := position, tokenIndex
-						if buffer[position] != rune('r') {
+						if buffer[position] != rune('s') {
 							goto l620
 						}
 						position++
 						goto l619
 					l620:
 						position, tokenIndex = position619, tokenIndex619
-						if buffer[position] != rune('R') {
-							goto l614
+						if buffer[position] != rune('S') {
+							goto l616
 						}
 						position++
 					}
 				l619:
-					goto l570
-				l614:
-					position, tokenIndex = position570, tokenIndex570
 					{
-						position622, tokenIndex622 := position, tokenIndex
+						position621, tokenIndex621 := position, tokenIndex
 						if buffer[position] != rune('r') {
-							goto l623
+							goto l622
 						}
 						position++
-						goto l622
-					l623:
-						position, tokenIndex = position622, tokenIndex622
+						goto l621
+					l622:
+						position, tokenIndex = position621, tokenIndex621
 						if buffer[position] != rune('R') {
-							goto l621
+							goto l616
 						}
 						position++
 					}
-				l622:
+				l621:
+					goto l572
+				l616:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position624, tokenIndex624 := position, tokenIndex
-						if buffer[position] != rune('o') {
+						if buffer[position] != rune('r') {
 							goto l625
 						}
 						position++
 						goto l624
 					l625:
 						position, tokenIndex = position624, tokenIndex624
-						if buffer[position] != rune('O') {
-							goto l621
+						if buffer[position] != rune('R') {
+							goto l623
 						}
 						position++
 					}
 				l624:
 					{
 						position626, tokenIndex626 := position, tokenIndex
-						if buffer[position] != rune('r') {
+						if buffer[position] != rune('o') {
 							goto l627
 						}
 						position++
 						goto l626
 					l627:
 						position, tokenIndex = position626, tokenIndex626
-						if buffer[position] != rune('R') {
-							goto l621
+						if buffer[position] != rune('O') {
+							goto l623
 						}
 						position++
 					}
 				l626:
-					goto l570
-				l621:
-					position, tokenIndex = position570, tokenIndex570
 					{
 						position628, tokenIndex628 := position, tokenIndex
-						if buffer[position] != rune('a') {
+						if buffer[position] != rune('r') {
 							goto l629
 						}
 						position++
 						goto l628
 					l629:
 						position, tokenIndex = position628, tokenIndex628
-						if buffer[position] != rune('A') {
-							goto l568
+						if buffer[position] != rune('R') {
+							goto l623
 						}
 						position++
 					}
 				l628:
+					goto l572
+				l623:
+					position, tokenIndex = position572, tokenIndex572
 					{
 						position630, tokenIndex630 := position, tokenIndex
-						if buffer[position] != rune('s') {
+						if buffer[position] != rune('a') {
 							goto l631
 						}
 						position++
 						goto l630
 					l631:
 						position, tokenIndex = position630, tokenIndex630
-						if buffer[position] != rune('S') {
-							goto l568
+						if buffer[position] != rune('A') {
+							goto l570
 						}
 						position++
 					}
 				l630:
 					{
 						position632, tokenIndex632 := position, tokenIndex
-						if buffer[position] != rune('r') {
+						if buffer[position] != rune('s') {
 							goto l633
 						}
 						position++
 						goto l632
 					l633:
 						position, tokenIndex = position632, tokenIndex632
-						if buffer[position] != rune('R') {
-							goto l568
+						if buffer[position] != rune('S') {
+							goto l570
 						}
 						position++
 					}
 				l632:
-				}
-			l570:
-				{
-					position634, tokenIndex634 := position, tokenIndex
-					if !_rules[ruleWS]() {
+					{
+						position634, tokenIndex634 := position, tokenIndex
+						if buffer[position] != rune('r') {
+							goto l635
+						}
+						position++
 						goto l634
+					l635:
+						position, tokenIndex = position634, tokenIndex634
+						if buffer[position] != rune('R') {
+							goto l570
+						}
+						position++
+					}
+				l634:
+				}
+			l572:
+				{
+					position636, tokenIndex636 := position, tokenIndex
+					if !_rules[ruleWS]() {
+						goto l636
 					}
 					if buffer[position] != rune('#') {
-						goto l634
+						goto l636
 					}
 					position++
 					if !_rules[ruleOffset]() {
-						goto l634
+						goto l636
 					}
-					goto l635
-				l634:
-					position, tokenIndex = position634, tokenIndex634
+					goto l637
+				l636:
+					position, tokenIndex = position636, tokenIndex636
 				}
-			l635:
-				add(ruleARMConstantTweak, position569)
+			l637:
+				add(ruleARMConstantTweak, position571)
 			}
 			return true
-		l568:
-			position, tokenIndex = position568, tokenIndex568
+		l570:
+			position, tokenIndex = position570, tokenIndex570
 			return false
 		},
 		/* 41 ARMRegister <- <((('s' / 'S') ('p' / 'P')) / (('x' / 'w' / 'd' / 'q' / 's') [0-9] [0-9]?) / (('x' / 'X') ('z' / 'Z') ('r' / 'R')) / (('w' / 'W') ('z' / 'Z') ('r' / 'R')) / ARMVectorRegister / ('{' WS? ARMVectorRegister (',' WS? ARMVectorRegister)* WS? '}' ('[' [0-9] [0-9]? ']')?))> */
 		func() bool {
-			position636, tokenIndex636 := position, tokenIndex
+			position638, tokenIndex638 := position, tokenIndex
 			{
-				position637 := position
+				position639 := position
 				{
-					position638, tokenIndex638 := position, tokenIndex
-					{
-						position640, tokenIndex640 := position, tokenIndex
-						if buffer[position] != rune('s') {
-							goto l641
-						}
-						position++
-						goto l640
-					l641:
-						position, tokenIndex = position640, tokenIndex640
-						if buffer[position] != rune('S') {
-							goto l639
-						}
-						position++
-					}
-				l640:
+					position640, tokenIndex640 := position, tokenIndex
 					{
 						position642, tokenIndex642 := position, tokenIndex
-						if buffer[position] != rune('p') {
+						if buffer[position] != rune('s') {
 							goto l643
 						}
 						position++
 						goto l642
 					l643:
 						position, tokenIndex = position642, tokenIndex642
-						if buffer[position] != rune('P') {
-							goto l639
+						if buffer[position] != rune('S') {
+							goto l641
 						}
 						position++
 					}
 				l642:
-					goto l638
-				l639:
-					position, tokenIndex = position638, tokenIndex638
 					{
-						position645, tokenIndex645 := position, tokenIndex
+						position644, tokenIndex644 := position, tokenIndex
+						if buffer[position] != rune('p') {
+							goto l645
+						}
+						position++
+						goto l644
+					l645:
+						position, tokenIndex = position644, tokenIndex644
+						if buffer[position] != rune('P') {
+							goto l641
+						}
+						position++
+					}
+				l644:
+					goto l640
+				l641:
+					position, tokenIndex = position640, tokenIndex640
+					{
+						position647, tokenIndex647 := position, tokenIndex
 						if buffer[position] != rune('x') {
-							goto l646
-						}
-						position++
-						goto l645
-					l646:
-						position, tokenIndex = position645, tokenIndex645
-						if buffer[position] != rune('w') {
-							goto l647
-						}
-						position++
-						goto l645
-					l647:
-						position, tokenIndex = position645, tokenIndex645
-						if buffer[position] != rune('d') {
 							goto l648
 						}
 						position++
-						goto l645
+						goto l647
 					l648:
-						position, tokenIndex = position645, tokenIndex645
-						if buffer[position] != rune('q') {
+						position, tokenIndex = position647, tokenIndex647
+						if buffer[position] != rune('w') {
 							goto l649
 						}
 						position++
-						goto l645
+						goto l647
 					l649:
-						position, tokenIndex = position645, tokenIndex645
-						if buffer[position] != rune('s') {
-							goto l644
-						}
-						position++
-					}
-				l645:
-					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l644
-					}
-					position++
-					{
-						position650, tokenIndex650 := position, tokenIndex
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
+						position, tokenIndex = position647, tokenIndex647
+						if buffer[position] != rune('d') {
 							goto l650
 						}
 						position++
-						goto l651
+						goto l647
 					l650:
-						position, tokenIndex = position650, tokenIndex650
-					}
-				l651:
-					goto l638
-				l644:
-					position, tokenIndex = position638, tokenIndex638
-					{
-						position653, tokenIndex653 := position, tokenIndex
-						if buffer[position] != rune('x') {
-							goto l654
+						position, tokenIndex = position647, tokenIndex647
+						if buffer[position] != rune('q') {
+							goto l651
 						}
 						position++
-						goto l653
-					l654:
-						position, tokenIndex = position653, tokenIndex653
-						if buffer[position] != rune('X') {
+						goto l647
+					l651:
+						position, tokenIndex = position647, tokenIndex647
+						if buffer[position] != rune('s') {
+							goto l646
+						}
+						position++
+					}
+				l647:
+					if c := buffer[position]; c < rune('0') || c > rune('9') {
+						goto l646
+					}
+					position++
+					{
+						position652, tokenIndex652 := position, tokenIndex
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
 							goto l652
 						}
 						position++
+						goto l653
+					l652:
+						position, tokenIndex = position652, tokenIndex652
 					}
 				l653:
+					goto l640
+				l646:
+					position, tokenIndex = position640, tokenIndex640
 					{
 						position655, tokenIndex655 := position, tokenIndex
-						if buffer[position] != rune('z') {
+						if buffer[position] != rune('x') {
 							goto l656
 						}
 						position++
 						goto l655
 					l656:
 						position, tokenIndex = position655, tokenIndex655
-						if buffer[position] != rune('Z') {
-							goto l652
+						if buffer[position] != rune('X') {
+							goto l654
 						}
 						position++
 					}
 				l655:
 					{
 						position657, tokenIndex657 := position, tokenIndex
-						if buffer[position] != rune('r') {
+						if buffer[position] != rune('z') {
 							goto l658
 						}
 						position++
 						goto l657
 					l658:
 						position, tokenIndex = position657, tokenIndex657
-						if buffer[position] != rune('R') {
-							goto l652
+						if buffer[position] != rune('Z') {
+							goto l654
 						}
 						position++
 					}
 				l657:
-					goto l638
-				l652:
-					position, tokenIndex = position638, tokenIndex638
 					{
-						position660, tokenIndex660 := position, tokenIndex
-						if buffer[position] != rune('w') {
-							goto l661
+						position659, tokenIndex659 := position, tokenIndex
+						if buffer[position] != rune('r') {
+							goto l660
 						}
 						position++
-						goto l660
-					l661:
-						position, tokenIndex = position660, tokenIndex660
-						if buffer[position] != rune('W') {
-							goto l659
+						goto l659
+					l660:
+						position, tokenIndex = position659, tokenIndex659
+						if buffer[position] != rune('R') {
+							goto l654
 						}
 						position++
 					}
-				l660:
+				l659:
+					goto l640
+				l654:
+					position, tokenIndex = position640, tokenIndex640
 					{
 						position662, tokenIndex662 := position, tokenIndex
-						if buffer[position] != rune('z') {
+						if buffer[position] != rune('w') {
 							goto l663
 						}
 						position++
 						goto l662
 					l663:
 						position, tokenIndex = position662, tokenIndex662
-						if buffer[position] != rune('Z') {
-							goto l659
+						if buffer[position] != rune('W') {
+							goto l661
 						}
 						position++
 					}
 				l662:
 					{
 						position664, tokenIndex664 := position, tokenIndex
-						if buffer[position] != rune('r') {
+						if buffer[position] != rune('z') {
 							goto l665
 						}
 						position++
 						goto l664
 					l665:
 						position, tokenIndex = position664, tokenIndex664
-						if buffer[position] != rune('R') {
-							goto l659
+						if buffer[position] != rune('Z') {
+							goto l661
 						}
 						position++
 					}
 				l664:
-					goto l638
-				l659:
-					position, tokenIndex = position638, tokenIndex638
-					if !_rules[ruleARMVectorRegister]() {
-						goto l666
-					}
-					goto l638
-				l666:
-					position, tokenIndex = position638, tokenIndex638
-					if buffer[position] != rune('{') {
-						goto l636
-					}
-					position++
 					{
-						position667, tokenIndex667 := position, tokenIndex
-						if !_rules[ruleWS]() {
+						position666, tokenIndex666 := position, tokenIndex
+						if buffer[position] != rune('r') {
 							goto l667
 						}
-						goto l668
+						position++
+						goto l666
 					l667:
-						position, tokenIndex = position667, tokenIndex667
-					}
-				l668:
-					if !_rules[ruleARMVectorRegister]() {
-						goto l636
-					}
-				l669:
-					{
-						position670, tokenIndex670 := position, tokenIndex
-						if buffer[position] != rune(',') {
-							goto l670
+						position, tokenIndex = position666, tokenIndex666
+						if buffer[position] != rune('R') {
+							goto l661
 						}
 						position++
-						{
-							position671, tokenIndex671 := position, tokenIndex
-							if !_rules[ruleWS]() {
-								goto l671
-							}
-							goto l672
-						l671:
-							position, tokenIndex = position671, tokenIndex671
-						}
-					l672:
-						if !_rules[ruleARMVectorRegister]() {
-							goto l670
-						}
-						goto l669
-					l670:
-						position, tokenIndex = position670, tokenIndex670
 					}
-					{
-						position673, tokenIndex673 := position, tokenIndex
-						if !_rules[ruleWS]() {
-							goto l673
-						}
-						goto l674
-					l673:
-						position, tokenIndex = position673, tokenIndex673
+				l666:
+					goto l640
+				l661:
+					position, tokenIndex = position640, tokenIndex640
+					if !_rules[ruleARMVectorRegister]() {
+						goto l668
 					}
-				l674:
-					if buffer[position] != rune('}') {
-						goto l636
+					goto l640
+				l668:
+					position, tokenIndex = position640, tokenIndex640
+					if buffer[position] != rune('{') {
+						goto l638
 					}
 					position++
 					{
-						position675, tokenIndex675 := position, tokenIndex
-						if buffer[position] != rune('[') {
-							goto l675
+						position669, tokenIndex669 := position, tokenIndex
+						if !_rules[ruleWS]() {
+							goto l669
 						}
-						position++
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l675
+						goto l670
+					l669:
+						position, tokenIndex = position669, tokenIndex669
+					}
+				l670:
+					if !_rules[ruleARMVectorRegister]() {
+						goto l638
+					}
+				l671:
+					{
+						position672, tokenIndex672 := position, tokenIndex
+						if buffer[position] != rune(',') {
+							goto l672
 						}
 						position++
 						{
-							position677, tokenIndex677 := position, tokenIndex
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l677
+							position673, tokenIndex673 := position, tokenIndex
+							if !_rules[ruleWS]() {
+								goto l673
 							}
-							position++
-							goto l678
-						l677:
-							position, tokenIndex = position677, tokenIndex677
+							goto l674
+						l673:
+							position, tokenIndex = position673, tokenIndex673
 						}
-					l678:
-						if buffer[position] != rune(']') {
+					l674:
+						if !_rules[ruleARMVectorRegister]() {
+							goto l672
+						}
+						goto l671
+					l672:
+						position, tokenIndex = position672, tokenIndex672
+					}
+					{
+						position675, tokenIndex675 := position, tokenIndex
+						if !_rules[ruleWS]() {
 							goto l675
 						}
-						position++
 						goto l676
 					l675:
 						position, tokenIndex = position675, tokenIndex675
 					}
 				l676:
+					if buffer[position] != rune('}') {
+						goto l638
+					}
+					position++
+					{
+						position677, tokenIndex677 := position, tokenIndex
+						if buffer[position] != rune('[') {
+							goto l677
+						}
+						position++
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l677
+						}
+						position++
+						{
+							position679, tokenIndex679 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l679
+							}
+							position++
+							goto l680
+						l679:
+							position, tokenIndex = position679, tokenIndex679
+						}
+					l680:
+						if buffer[position] != rune(']') {
+							goto l677
+						}
+						position++
+						goto l678
+					l677:
+						position, tokenIndex = position677, tokenIndex677
+					}
+				l678:
 				}
-			l638:
-				add(ruleARMRegister, position637)
+			l640:
+				add(ruleARMRegister, position639)
 			}
 			return true
-		l636:
-			position, tokenIndex = position636, tokenIndex636
+		l638:
+			position, tokenIndex = position638, tokenIndex638
 			return false
 		},
 		/* 42 ARMVectorRegister <- <(('v' / 'V') [0-9] [0-9]? ('.' [0-9]* ('b' / 's' / 'd' / 'h' / 'q') ('[' [0-9] [0-9]? ']')?)?)> */
 		func() bool {
-			position679, tokenIndex679 := position, tokenIndex
+			position681, tokenIndex681 := position, tokenIndex
 			{
-				position680 := position
+				position682 := position
 				{
-					position681, tokenIndex681 := position, tokenIndex
+					position683, tokenIndex683 := position, tokenIndex
 					if buffer[position] != rune('v') {
-						goto l682
+						goto l684
 					}
 					position++
-					goto l681
-				l682:
-					position, tokenIndex = position681, tokenIndex681
+					goto l683
+				l684:
+					position, tokenIndex = position683, tokenIndex683
 					if buffer[position] != rune('V') {
-						goto l679
+						goto l681
 					}
 					position++
 				}
-			l681:
+			l683:
 				if c := buffer[position]; c < rune('0') || c > rune('9') {
-					goto l679
+					goto l681
 				}
 				position++
 				{
-					position683, tokenIndex683 := position, tokenIndex
-					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l683
-					}
-					position++
-					goto l684
-				l683:
-					position, tokenIndex = position683, tokenIndex683
-				}
-			l684:
-				{
 					position685, tokenIndex685 := position, tokenIndex
-					if buffer[position] != rune('.') {
+					if c := buffer[position]; c < rune('0') || c > rune('9') {
 						goto l685
 					}
 					position++
-				l687:
-					{
-						position688, tokenIndex688 := position, tokenIndex
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l688
-						}
-						position++
-						goto l687
-					l688:
-						position, tokenIndex = position688, tokenIndex688
-					}
-					{
-						position689, tokenIndex689 := position, tokenIndex
-						if buffer[position] != rune('b') {
-							goto l690
-						}
-						position++
-						goto l689
-					l690:
-						position, tokenIndex = position689, tokenIndex689
-						if buffer[position] != rune('s') {
-							goto l691
-						}
-						position++
-						goto l689
-					l691:
-						position, tokenIndex = position689, tokenIndex689
-						if buffer[position] != rune('d') {
-							goto l692
-						}
-						position++
-						goto l689
-					l692:
-						position, tokenIndex = position689, tokenIndex689
-						if buffer[position] != rune('h') {
-							goto l693
-						}
-						position++
-						goto l689
-					l693:
-						position, tokenIndex = position689, tokenIndex689
-						if buffer[position] != rune('q') {
-							goto l685
-						}
-						position++
-					}
-				l689:
-					{
-						position694, tokenIndex694 := position, tokenIndex
-						if buffer[position] != rune('[') {
-							goto l694
-						}
-						position++
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l694
-						}
-						position++
-						{
-							position696, tokenIndex696 := position, tokenIndex
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l696
-							}
-							position++
-							goto l697
-						l696:
-							position, tokenIndex = position696, tokenIndex696
-						}
-					l697:
-						if buffer[position] != rune(']') {
-							goto l694
-						}
-						position++
-						goto l695
-					l694:
-						position, tokenIndex = position694, tokenIndex694
-					}
-				l695:
 					goto l686
 				l685:
 					position, tokenIndex = position685, tokenIndex685
 				}
 			l686:
-				add(ruleARMVectorRegister, position680)
+				{
+					position687, tokenIndex687 := position, tokenIndex
+					if buffer[position] != rune('.') {
+						goto l687
+					}
+					position++
+				l689:
+					{
+						position690, tokenIndex690 := position, tokenIndex
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l690
+						}
+						position++
+						goto l689
+					l690:
+						position, tokenIndex = position690, tokenIndex690
+					}
+					{
+						position691, tokenIndex691 := position, tokenIndex
+						if buffer[position] != rune('b') {
+							goto l692
+						}
+						position++
+						goto l691
+					l692:
+						position, tokenIndex = position691, tokenIndex691
+						if buffer[position] != rune('s') {
+							goto l693
+						}
+						position++
+						goto l691
+					l693:
+						position, tokenIndex = position691, tokenIndex691
+						if buffer[position] != rune('d') {
+							goto l694
+						}
+						position++
+						goto l691
+					l694:
+						position, tokenIndex = position691, tokenIndex691
+						if buffer[position] != rune('h') {
+							goto l695
+						}
+						position++
+						goto l691
+					l695:
+						position, tokenIndex = position691, tokenIndex691
+						if buffer[position] != rune('q') {
+							goto l687
+						}
+						position++
+					}
+				l691:
+					{
+						position696, tokenIndex696 := position, tokenIndex
+						if buffer[position] != rune('[') {
+							goto l696
+						}
+						position++
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l696
+						}
+						position++
+						{
+							position698, tokenIndex698 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l698
+							}
+							position++
+							goto l699
+						l698:
+							position, tokenIndex = position698, tokenIndex698
+						}
+					l699:
+						if buffer[position] != rune(']') {
+							goto l696
+						}
+						position++
+						goto l697
+					l696:
+						position, tokenIndex = position696, tokenIndex696
+					}
+				l697:
+					goto l688
+				l687:
+					position, tokenIndex = position687, tokenIndex687
+				}
+			l688:
+				add(ruleARMVectorRegister, position682)
 			}
 			return true
-		l679:
-			position, tokenIndex = position679, tokenIndex679
+		l681:
+			position, tokenIndex = position681, tokenIndex681
 			return false
 		},
 		/* 43 MemoryRef <- <((SymbolRef BaseIndexScale) / SymbolRef / Low12BitsSymbolRef / (Offset* BaseIndexScale) / (SegmentRegister Offset BaseIndexScale) / (SegmentRegister BaseIndexScale) / (SegmentRegister Offset) / ARMBaseIndexScale / BaseIndexScale)> */
 		func() bool {
-			position698, tokenIndex698 := position, tokenIndex
+			position700, tokenIndex700 := position, tokenIndex
 			{
-				position699 := position
+				position701 := position
 				{
-					position700, tokenIndex700 := position, tokenIndex
+					position702, tokenIndex702 := position, tokenIndex
 					if !_rules[ruleSymbolRef]() {
-						goto l701
-					}
-					if !_rules[ruleBaseIndexScale]() {
-						goto l701
-					}
-					goto l700
-				l701:
-					position, tokenIndex = position700, tokenIndex700
-					if !_rules[ruleSymbolRef]() {
-						goto l702
-					}
-					goto l700
-				l702:
-					position, tokenIndex = position700, tokenIndex700
-					if !_rules[ruleLow12BitsSymbolRef]() {
 						goto l703
 					}
-					goto l700
-				l703:
-					position, tokenIndex = position700, tokenIndex700
-				l705:
-					{
-						position706, tokenIndex706 := position, tokenIndex
-						if !_rules[ruleOffset]() {
-							goto l706
-						}
-						goto l705
-					l706:
-						position, tokenIndex = position706, tokenIndex706
-					}
 					if !_rules[ruleBaseIndexScale]() {
+						goto l703
+					}
+					goto l702
+				l703:
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleSymbolRef]() {
 						goto l704
 					}
-					goto l700
+					goto l702
 				l704:
-					position, tokenIndex = position700, tokenIndex700
-					if !_rules[ruleSegmentRegister]() {
-						goto l707
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleLow12BitsSymbolRef]() {
+						goto l705
 					}
-					if !_rules[ruleOffset]() {
-						goto l707
-					}
-					if !_rules[ruleBaseIndexScale]() {
-						goto l707
-					}
-					goto l700
+					goto l702
+				l705:
+					position, tokenIndex = position702, tokenIndex702
 				l707:
-					position, tokenIndex = position700, tokenIndex700
-					if !_rules[ruleSegmentRegister]() {
-						goto l708
+					{
+						position708, tokenIndex708 := position, tokenIndex
+						if !_rules[ruleOffset]() {
+							goto l708
+						}
+						goto l707
+					l708:
+						position, tokenIndex = position708, tokenIndex708
 					}
 					if !_rules[ruleBaseIndexScale]() {
-						goto l708
+						goto l706
 					}
-					goto l700
-				l708:
-					position, tokenIndex = position700, tokenIndex700
+					goto l702
+				l706:
+					position, tokenIndex = position702, tokenIndex702
 					if !_rules[ruleSegmentRegister]() {
 						goto l709
 					}
 					if !_rules[ruleOffset]() {
 						goto l709
 					}
-					goto l700
+					if !_rules[ruleBaseIndexScale]() {
+						goto l709
+					}
+					goto l702
 				l709:
-					position, tokenIndex = position700, tokenIndex700
-					if !_rules[ruleARMBaseIndexScale]() {
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleSegmentRegister]() {
 						goto l710
 					}
-					goto l700
-				l710:
-					position, tokenIndex = position700, tokenIndex700
 					if !_rules[ruleBaseIndexScale]() {
-						goto l698
+						goto l710
+					}
+					goto l702
+				l710:
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleSegmentRegister]() {
+						goto l711
+					}
+					if !_rules[ruleOffset]() {
+						goto l711
+					}
+					goto l702
+				l711:
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleARMBaseIndexScale]() {
+						goto l712
+					}
+					goto l702
+				l712:
+					position, tokenIndex = position702, tokenIndex702
+					if !_rules[ruleBaseIndexScale]() {
+						goto l700
 					}
 				}
-			l700:
-				add(ruleMemoryRef, position699)
+			l702:
+				add(ruleMemoryRef, position701)
 			}
 			return true
-		l698:
-			position, tokenIndex = position698, tokenIndex698
+		l700:
+			position, tokenIndex = position700, tokenIndex700
 			return false
 		},
 		/* 44 SymbolRef <- <((Offset* '+')? (LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?)> */
 		func() bool {
-			position711, tokenIndex711 := position, tokenIndex
+			position713, tokenIndex713 := position, tokenIndex
 			{
-				position712 := position
+				position714 := position
 				{
-					position713, tokenIndex713 := position, tokenIndex
-				l715:
+					position715, tokenIndex715 := position, tokenIndex
+				l717:
 					{
-						position716, tokenIndex716 := position, tokenIndex
+						position718, tokenIndex718 := position, tokenIndex
 						if !_rules[ruleOffset]() {
-							goto l716
+							goto l718
 						}
-						goto l715
-					l716:
-						position, tokenIndex = position716, tokenIndex716
+						goto l717
+					l718:
+						position, tokenIndex = position718, tokenIndex718
 					}
 					if buffer[position] != rune('+') {
-						goto l713
+						goto l715
 					}
 					position++
-					goto l714
-				l713:
-					position, tokenIndex = position713, tokenIndex713
+					goto l716
+				l715:
+					position, tokenIndex = position715, tokenIndex715
 				}
-			l714:
+			l716:
 				{
-					position717, tokenIndex717 := position, tokenIndex
+					position719, tokenIndex719 := position, tokenIndex
 					if !_rules[ruleLocalSymbol]() {
-						goto l718
-					}
-					goto l717
-				l718:
-					position, tokenIndex = position717, tokenIndex717
-					if !_rules[ruleSymbolName]() {
-						goto l711
-					}
-				}
-			l717:
-			l719:
-				{
-					position720, tokenIndex720 := position, tokenIndex
-					if !_rules[ruleOffset]() {
 						goto l720
 					}
 					goto l719
 				l720:
-					position, tokenIndex = position720, tokenIndex720
+					position, tokenIndex = position719, tokenIndex719
+					if !_rules[ruleSymbolName]() {
+						goto l713
+					}
+				}
+			l719:
+			l721:
+				{
+					position722, tokenIndex722 := position, tokenIndex
+					if !_rules[ruleOffset]() {
+						goto l722
+					}
+					goto l721
+				l722:
+					position, tokenIndex = position722, tokenIndex722
 				}
 				{
-					position721, tokenIndex721 := position, tokenIndex
+					position723, tokenIndex723 := position, tokenIndex
 					if buffer[position] != rune('@') {
-						goto l721
+						goto l723
 					}
 					position++
 					if !_rules[ruleSection]() {
-						goto l721
-					}
-				l723:
-					{
-						position724, tokenIndex724 := position, tokenIndex
-						if !_rules[ruleOffset]() {
-							goto l724
-						}
 						goto l723
-					l724:
-						position, tokenIndex = position724, tokenIndex724
 					}
-					goto l722
-				l721:
-					position, tokenIndex = position721, tokenIndex721
+				l725:
+					{
+						position726, tokenIndex726 := position, tokenIndex
+						if !_rules[ruleOffset]() {
+							goto l726
+						}
+						goto l725
+					l726:
+						position, tokenIndex = position726, tokenIndex726
+					}
+					goto l724
+				l723:
+					position, tokenIndex = position723, tokenIndex723
 				}
-			l722:
-				add(ruleSymbolRef, position712)
+			l724:
+				add(ruleSymbolRef, position714)
 			}
 			return true
-		l711:
-			position, tokenIndex = position711, tokenIndex711
+		l713:
+			position, tokenIndex = position713, tokenIndex713
 			return false
 		},
 		/* 45 Low12BitsSymbolRef <- <(':' ('l' / 'L') ('o' / 'O') '1' '2' ':' (LocalSymbol / SymbolName) Offset?)> */
 		func() bool {
-			position725, tokenIndex725 := position, tokenIndex
+			position727, tokenIndex727 := position, tokenIndex
 			{
-				position726 := position
+				position728 := position
 				if buffer[position] != rune(':') {
-					goto l725
+					goto l727
 				}
 				position++
 				{
-					position727, tokenIndex727 := position, tokenIndex
-					if buffer[position] != rune('l') {
-						goto l728
-					}
-					position++
-					goto l727
-				l728:
-					position, tokenIndex = position727, tokenIndex727
-					if buffer[position] != rune('L') {
-						goto l725
-					}
-					position++
-				}
-			l727:
-				{
 					position729, tokenIndex729 := position, tokenIndex
-					if buffer[position] != rune('o') {
+					if buffer[position] != rune('l') {
 						goto l730
 					}
 					position++
 					goto l729
 				l730:
 					position, tokenIndex = position729, tokenIndex729
-					if buffer[position] != rune('O') {
-						goto l725
+					if buffer[position] != rune('L') {
+						goto l727
 					}
 					position++
 				}
 			l729:
-				if buffer[position] != rune('1') {
-					goto l725
-				}
-				position++
-				if buffer[position] != rune('2') {
-					goto l725
-				}
-				position++
-				if buffer[position] != rune(':') {
-					goto l725
-				}
-				position++
 				{
 					position731, tokenIndex731 := position, tokenIndex
-					if !_rules[ruleLocalSymbol]() {
+					if buffer[position] != rune('o') {
 						goto l732
 					}
+					position++
 					goto l731
 				l732:
 					position, tokenIndex = position731, tokenIndex731
-					if !_rules[ruleSymbolName]() {
-						goto l725
+					if buffer[position] != rune('O') {
+						goto l727
 					}
+					position++
 				}
 			l731:
+				if buffer[position] != rune('1') {
+					goto l727
+				}
+				position++
+				if buffer[position] != rune('2') {
+					goto l727
+				}
+				position++
+				if buffer[position] != rune(':') {
+					goto l727
+				}
+				position++
 				{
 					position733, tokenIndex733 := position, tokenIndex
-					if !_rules[ruleOffset]() {
-						goto l733
+					if !_rules[ruleLocalSymbol]() {
+						goto l734
 					}
-					goto l734
-				l733:
+					goto l733
+				l734:
 					position, tokenIndex = position733, tokenIndex733
+					if !_rules[ruleSymbolName]() {
+						goto l727
+					}
 				}
-			l734:
-				add(ruleLow12BitsSymbolRef, position726)
+			l733:
+				{
+					position735, tokenIndex735 := position, tokenIndex
+					if !_rules[ruleOffset]() {
+						goto l735
+					}
+					goto l736
+				l735:
+					position, tokenIndex = position735, tokenIndex735
+				}
+			l736:
+				add(ruleLow12BitsSymbolRef, position728)
 			}
 			return true
-		l725:
-			position, tokenIndex = position725, tokenIndex725
+		l727:
+			position, tokenIndex = position727, tokenIndex727
 			return false
 		},
 		/* 46 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#' Offset ('*' [0-9]+)?) / ARMGOTLow12 / Low12BitsSymbolRef / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */
 		func() bool {
-			position735, tokenIndex735 := position, tokenIndex
+			position737, tokenIndex737 := position, tokenIndex
 			{
-				position736 := position
+				position738 := position
 				if buffer[position] != rune('[') {
-					goto l735
+					goto l737
 				}
 				position++
 				if !_rules[ruleARMRegister]() {
-					goto l735
+					goto l737
 				}
 				{
-					position737, tokenIndex737 := position, tokenIndex
+					position739, tokenIndex739 := position, tokenIndex
 					if buffer[position] != rune(',') {
-						goto l737
+						goto l739
 					}
 					position++
 					{
-						position739, tokenIndex739 := position, tokenIndex
-						if !_rules[ruleWS]() {
-							goto l739
-						}
-						goto l740
-					l739:
-						position, tokenIndex = position739, tokenIndex739
-					}
-				l740:
-					{
 						position741, tokenIndex741 := position, tokenIndex
+						if !_rules[ruleWS]() {
+							goto l741
+						}
+						goto l742
+					l741:
+						position, tokenIndex = position741, tokenIndex741
+					}
+				l742:
+					{
+						position743, tokenIndex743 := position, tokenIndex
 						if buffer[position] != rune('#') {
-							goto l742
+							goto l744
 						}
 						position++
 						if !_rules[ruleOffset]() {
-							goto l742
+							goto l744
 						}
 						{
-							position743, tokenIndex743 := position, tokenIndex
+							position745, tokenIndex745 := position, tokenIndex
 							if buffer[position] != rune('*') {
-								goto l743
+								goto l745
 							}
 							position++
 							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l743
+								goto l745
 							}
 							position++
-						l745:
+						l747:
 							{
-								position746, tokenIndex746 := position, tokenIndex
+								position748, tokenIndex748 := position, tokenIndex
 								if c := buffer[position]; c < rune('0') || c > rune('9') {
-									goto l746
+									goto l748
 								}
 								position++
-								goto l745
-							l746:
-								position, tokenIndex = position746, tokenIndex746
+								goto l747
+							l748:
+								position, tokenIndex = position748, tokenIndex748
 							}
-							goto l744
-						l743:
-							position, tokenIndex = position743, tokenIndex743
+							goto l746
+						l745:
+							position, tokenIndex = position745, tokenIndex745
 						}
+					l746:
+						goto l743
 					l744:
-						goto l741
-					l742:
-						position, tokenIndex = position741, tokenIndex741
+						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleARMGOTLow12]() {
-							goto l747
+							goto l749
 						}
-						goto l741
-					l747:
-						position, tokenIndex = position741, tokenIndex741
+						goto l743
+					l749:
+						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleLow12BitsSymbolRef]() {
-							goto l748
+							goto l750
 						}
-						goto l741
-					l748:
-						position, tokenIndex = position741, tokenIndex741
+						goto l743
+					l750:
+						position, tokenIndex = position743, tokenIndex743
 						if !_rules[ruleARMRegister]() {
-							goto l737
+							goto l739
 						}
 					}
-				l741:
+				l743:
 					{
-						position749, tokenIndex749 := position, tokenIndex
+						position751, tokenIndex751 := position, tokenIndex
 						if buffer[position] != rune(',') {
-							goto l749
+							goto l751
 						}
 						position++
 						{
-							position751, tokenIndex751 := position, tokenIndex
+							position753, tokenIndex753 := position, tokenIndex
 							if !_rules[ruleWS]() {
-								goto l751
+								goto l753
 							}
-							goto l752
-						l751:
-							position, tokenIndex = position751, tokenIndex751
+							goto l754
+						l753:
+							position, tokenIndex = position753, tokenIndex753
 						}
-					l752:
+					l754:
 						if !_rules[ruleARMConstantTweak]() {
-							goto l749
+							goto l751
 						}
-						goto l750
-					l749:
-						position, tokenIndex = position749, tokenIndex749
+						goto l752
+					l751:
+						position, tokenIndex = position751, tokenIndex751
 					}
-				l750:
-					goto l738
-				l737:
-					position, tokenIndex = position737, tokenIndex737
+				l752:
+					goto l740
+				l739:
+					position, tokenIndex = position739, tokenIndex739
 				}
-			l738:
+			l740:
 				if buffer[position] != rune(']') {
-					goto l735
+					goto l737
 				}
 				position++
 				{
-					position753, tokenIndex753 := position, tokenIndex
+					position755, tokenIndex755 := position, tokenIndex
 					if !_rules[ruleARMPostincrement]() {
-						goto l753
+						goto l755
 					}
-					goto l754
-				l753:
-					position, tokenIndex = position753, tokenIndex753
+					goto l756
+				l755:
+					position, tokenIndex = position755, tokenIndex755
 				}
-			l754:
-				add(ruleARMBaseIndexScale, position736)
+			l756:
+				add(ruleARMBaseIndexScale, position738)
 			}
 			return true
-		l735:
-			position, tokenIndex = position735, tokenIndex735
+		l737:
+			position, tokenIndex = position737, tokenIndex737
 			return false
 		},
 		/* 47 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */
 		func() bool {
-			position755, tokenIndex755 := position, tokenIndex
+			position757, tokenIndex757 := position, tokenIndex
 			{
-				position756 := position
+				position758 := position
 				if buffer[position] != rune(':') {
-					goto l755
+					goto l757
 				}
 				position++
 				{
-					position757, tokenIndex757 := position, tokenIndex
-					if buffer[position] != rune('g') {
-						goto l758
-					}
-					position++
-					goto l757
-				l758:
-					position, tokenIndex = position757, tokenIndex757
-					if buffer[position] != rune('G') {
-						goto l755
-					}
-					position++
-				}
-			l757:
-				{
 					position759, tokenIndex759 := position, tokenIndex
-					if buffer[position] != rune('o') {
+					if buffer[position] != rune('g') {
 						goto l760
 					}
 					position++
 					goto l759
 				l760:
 					position, tokenIndex = position759, tokenIndex759
-					if buffer[position] != rune('O') {
-						goto l755
+					if buffer[position] != rune('G') {
+						goto l757
 					}
 					position++
 				}
 			l759:
 				{
 					position761, tokenIndex761 := position, tokenIndex
-					if buffer[position] != rune('t') {
+					if buffer[position] != rune('o') {
 						goto l762
 					}
 					position++
 					goto l761
 				l762:
 					position, tokenIndex = position761, tokenIndex761
-					if buffer[position] != rune('T') {
-						goto l755
+					if buffer[position] != rune('O') {
+						goto l757
 					}
 					position++
 				}
 			l761:
-				if buffer[position] != rune('_') {
-					goto l755
-				}
-				position++
 				{
 					position763, tokenIndex763 := position, tokenIndex
-					if buffer[position] != rune('l') {
+					if buffer[position] != rune('t') {
 						goto l764
 					}
 					position++
 					goto l763
 				l764:
 					position, tokenIndex = position763, tokenIndex763
-					if buffer[position] != rune('L') {
-						goto l755
+					if buffer[position] != rune('T') {
+						goto l757
 					}
 					position++
 				}
 			l763:
+				if buffer[position] != rune('_') {
+					goto l757
+				}
+				position++
 				{
 					position765, tokenIndex765 := position, tokenIndex
-					if buffer[position] != rune('o') {
+					if buffer[position] != rune('l') {
 						goto l766
 					}
 					position++
 					goto l765
 				l766:
 					position, tokenIndex = position765, tokenIndex765
-					if buffer[position] != rune('O') {
-						goto l755
+					if buffer[position] != rune('L') {
+						goto l757
 					}
 					position++
 				}
 			l765:
+				{
+					position767, tokenIndex767 := position, tokenIndex
+					if buffer[position] != rune('o') {
+						goto l768
+					}
+					position++
+					goto l767
+				l768:
+					position, tokenIndex = position767, tokenIndex767
+					if buffer[position] != rune('O') {
+						goto l757
+					}
+					position++
+				}
+			l767:
 				if buffer[position] != rune('1') {
-					goto l755
+					goto l757
 				}
 				position++
 				if buffer[position] != rune('2') {
-					goto l755
+					goto l757
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l755
+					goto l757
 				}
 				position++
 				if !_rules[ruleSymbolName]() {
-					goto l755
+					goto l757
 				}
-				add(ruleARMGOTLow12, position756)
+				add(ruleARMGOTLow12, position758)
 			}
 			return true
-		l755:
-			position, tokenIndex = position755, tokenIndex755
+		l757:
+			position, tokenIndex = position757, tokenIndex757
 			return false
 		},
 		/* 48 ARMPostincrement <- <'!'> */
 		func() bool {
-			position767, tokenIndex767 := position, tokenIndex
+			position769, tokenIndex769 := position, tokenIndex
 			{
-				position768 := position
+				position770 := position
 				if buffer[position] != rune('!') {
-					goto l767
+					goto l769
 				}
 				position++
-				add(ruleARMPostincrement, position768)
+				add(ruleARMPostincrement, position770)
 			}
 			return true
-		l767:
-			position, tokenIndex = position767, tokenIndex767
+		l769:
+			position, tokenIndex = position769, tokenIndex769
 			return false
 		},
 		/* 49 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? ')')> */
 		func() bool {
-			position769, tokenIndex769 := position, tokenIndex
+			position771, tokenIndex771 := position, tokenIndex
 			{
-				position770 := position
+				position772 := position
 				if buffer[position] != rune('(') {
-					goto l769
+					goto l771
 				}
 				position++
 				{
-					position771, tokenIndex771 := position, tokenIndex
-					if !_rules[ruleRegisterOrConstant]() {
-						goto l771
-					}
-					goto l772
-				l771:
-					position, tokenIndex = position771, tokenIndex771
-				}
-			l772:
-				{
 					position773, tokenIndex773 := position, tokenIndex
-					if !_rules[ruleWS]() {
+					if !_rules[ruleRegisterOrConstant]() {
 						goto l773
 					}
 					goto l774
@@ -5991,24 +5988,21 @@
 			l774:
 				{
 					position775, tokenIndex775 := position, tokenIndex
-					if buffer[position] != rune(',') {
+					if !_rules[ruleWS]() {
 						goto l775
 					}
+					goto l776
+				l775:
+					position, tokenIndex = position775, tokenIndex775
+				}
+			l776:
+				{
+					position777, tokenIndex777 := position, tokenIndex
+					if buffer[position] != rune(',') {
+						goto l777
+					}
 					position++
 					{
-						position777, tokenIndex777 := position, tokenIndex
-						if !_rules[ruleWS]() {
-							goto l777
-						}
-						goto l778
-					l777:
-						position, tokenIndex = position777, tokenIndex777
-					}
-				l778:
-					if !_rules[ruleRegisterOrConstant]() {
-						goto l775
-					}
-					{
 						position779, tokenIndex779 := position, tokenIndex
 						if !_rules[ruleWS]() {
 							goto l779
@@ -6018,94 +6012,96 @@
 						position, tokenIndex = position779, tokenIndex779
 					}
 				l780:
+					if !_rules[ruleRegisterOrConstant]() {
+						goto l777
+					}
 					{
 						position781, tokenIndex781 := position, tokenIndex
-						if buffer[position] != rune(',') {
+						if !_rules[ruleWS]() {
 							goto l781
 						}
-						position++
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l781
-						}
-						position++
-					l783:
-						{
-							position784, tokenIndex784 := position, tokenIndex
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l784
-							}
-							position++
-							goto l783
-						l784:
-							position, tokenIndex = position784, tokenIndex784
-						}
 						goto l782
 					l781:
 						position, tokenIndex = position781, tokenIndex781
 					}
 				l782:
-					goto l776
-				l775:
-					position, tokenIndex = position775, tokenIndex775
+					{
+						position783, tokenIndex783 := position, tokenIndex
+						if buffer[position] != rune(',') {
+							goto l783
+						}
+						position++
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l783
+						}
+						position++
+					l785:
+						{
+							position786, tokenIndex786 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l786
+							}
+							position++
+							goto l785
+						l786:
+							position, tokenIndex = position786, tokenIndex786
+						}
+						goto l784
+					l783:
+						position, tokenIndex = position783, tokenIndex783
+					}
+				l784:
+					goto l778
+				l777:
+					position, tokenIndex = position777, tokenIndex777
 				}
-			l776:
+			l778:
 				if buffer[position] != rune(')') {
-					goto l769
+					goto l771
 				}
 				position++
-				add(ruleBaseIndexScale, position770)
+				add(ruleBaseIndexScale, position772)
 			}
 			return true
-		l769:
-			position, tokenIndex = position769, tokenIndex769
+		l771:
+			position, tokenIndex = position771, tokenIndex771
 			return false
 		},
 		/* 50 Operator <- <('+' / '-')> */
 		func() bool {
-			position785, tokenIndex785 := position, tokenIndex
+			position787, tokenIndex787 := position, tokenIndex
 			{
-				position786 := position
+				position788 := position
 				{
-					position787, tokenIndex787 := position, tokenIndex
+					position789, tokenIndex789 := position, tokenIndex
 					if buffer[position] != rune('+') {
-						goto l788
+						goto l790
 					}
 					position++
-					goto l787
-				l788:
-					position, tokenIndex = position787, tokenIndex787
+					goto l789
+				l790:
+					position, tokenIndex = position789, tokenIndex789
 					if buffer[position] != rune('-') {
-						goto l785
+						goto l787
 					}
 					position++
 				}
-			l787:
-				add(ruleOperator, position786)
+			l789:
+				add(ruleOperator, position788)
 			}
 			return true
-		l785:
-			position, tokenIndex = position785, tokenIndex785
+		l787:
+			position, tokenIndex = position787, tokenIndex787
 			return false
 		},
 		/* 51 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / [0-9]+))> */
 		func() bool {
-			position789, tokenIndex789 := position, tokenIndex
+			position791, tokenIndex791 := position, tokenIndex
 			{
-				position790 := position
-				{
-					position791, tokenIndex791 := position, tokenIndex
-					if buffer[position] != rune('+') {
-						goto l791
-					}
-					position++
-					goto l792
-				l791:
-					position, tokenIndex = position791, tokenIndex791
-				}
-			l792:
+				position792 := position
 				{
 					position793, tokenIndex793 := position, tokenIndex
-					if buffer[position] != rune('-') {
+					if buffer[position] != rune('+') {
 						goto l793
 					}
 					position++
@@ -6116,284 +6112,295 @@
 			l794:
 				{
 					position795, tokenIndex795 := position, tokenIndex
+					if buffer[position] != rune('-') {
+						goto l795
+					}
+					position++
+					goto l796
+				l795:
+					position, tokenIndex = position795, tokenIndex795
+				}
+			l796:
+				{
+					position797, tokenIndex797 := position, tokenIndex
 					if buffer[position] != rune('0') {
-						goto l796
+						goto l798
 					}
 					position++
 					{
-						position797, tokenIndex797 := position, tokenIndex
+						position799, tokenIndex799 := position, tokenIndex
 						if buffer[position] != rune('b') {
+							goto l800
+						}
+						position++
+						goto l799
+					l800:
+						position, tokenIndex = position799, tokenIndex799
+						if buffer[position] != rune('B') {
 							goto l798
 						}
 						position++
-						goto l797
-					l798:
-						position, tokenIndex = position797, tokenIndex797
-						if buffer[position] != rune('B') {
-							goto l796
-						}
-						position++
 					}
-				l797:
-					{
-						position801, tokenIndex801 := position, tokenIndex
-						if buffer[position] != rune('0') {
-							goto l802
-						}
-						position++
-						goto l801
-					l802:
-						position, tokenIndex = position801, tokenIndex801
-						if buffer[position] != rune('1') {
-							goto l796
-						}
-						position++
-					}
-				l801:
 				l799:
 					{
-						position800, tokenIndex800 := position, tokenIndex
+						position803, tokenIndex803 := position, tokenIndex
+						if buffer[position] != rune('0') {
+							goto l804
+						}
+						position++
+						goto l803
+					l804:
+						position, tokenIndex = position803, tokenIndex803
+						if buffer[position] != rune('1') {
+							goto l798
+						}
+						position++
+					}
+				l803:
+				l801:
+					{
+						position802, tokenIndex802 := position, tokenIndex
 						{
-							position803, tokenIndex803 := position, tokenIndex
+							position805, tokenIndex805 := position, tokenIndex
 							if buffer[position] != rune('0') {
-								goto l804
+								goto l806
 							}
 							position++
-							goto l803
-						l804:
-							position, tokenIndex = position803, tokenIndex803
+							goto l805
+						l806:
+							position, tokenIndex = position805, tokenIndex805
 							if buffer[position] != rune('1') {
-								goto l800
+								goto l802
 							}
 							position++
 						}
-					l803:
-						goto l799
-					l800:
-						position, tokenIndex = position800, tokenIndex800
+					l805:
+						goto l801
+					l802:
+						position, tokenIndex = position802, tokenIndex802
 					}
-					goto l795
-				l796:
-					position, tokenIndex = position795, tokenIndex795
+					goto l797
+				l798:
+					position, tokenIndex = position797, tokenIndex797
 					if buffer[position] != rune('0') {
-						goto l805
+						goto l807
 					}
 					position++
 					{
-						position806, tokenIndex806 := position, tokenIndex
+						position808, tokenIndex808 := position, tokenIndex
 						if buffer[position] != rune('x') {
+							goto l809
+						}
+						position++
+						goto l808
+					l809:
+						position, tokenIndex = position808, tokenIndex808
+						if buffer[position] != rune('X') {
 							goto l807
 						}
 						position++
-						goto l806
-					l807:
-						position, tokenIndex = position806, tokenIndex806
-						if buffer[position] != rune('X') {
-							goto l805
-						}
-						position++
 					}
-				l806:
-					{
-						position810, tokenIndex810 := position, tokenIndex
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l811
-						}
-						position++
-						goto l810
-					l811:
-						position, tokenIndex = position810, tokenIndex810
-						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l812
-						}
-						position++
-						goto l810
-					l812:
-						position, tokenIndex = position810, tokenIndex810
-						{
-							position813, tokenIndex813 := position, tokenIndex
-							if c := buffer[position]; c < rune('a') || c > rune('f') {
-								goto l814
-							}
-							position++
-							goto l813
-						l814:
-							position, tokenIndex = position813, tokenIndex813
-							if c := buffer[position]; c < rune('A') || c > rune('F') {
-								goto l805
-							}
-							position++
-						}
-					l813:
-					}
-				l810:
 				l808:
 					{
-						position809, tokenIndex809 := position, tokenIndex
+						position812, tokenIndex812 := position, tokenIndex
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l813
+						}
+						position++
+						goto l812
+					l813:
+						position, tokenIndex = position812, tokenIndex812
+						if c := buffer[position]; c < rune('0') || c > rune('9') {
+							goto l814
+						}
+						position++
+						goto l812
+					l814:
+						position, tokenIndex = position812, tokenIndex812
 						{
 							position815, tokenIndex815 := position, tokenIndex
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
+							if c := buffer[position]; c < rune('a') || c > rune('f') {
 								goto l816
 							}
 							position++
 							goto l815
 						l816:
 							position, tokenIndex = position815, tokenIndex815
-							if c := buffer[position]; c < rune('0') || c > rune('9') {
-								goto l817
+							if c := buffer[position]; c < rune('A') || c > rune('F') {
+								goto l807
 							}
 							position++
-							goto l815
-						l817:
-							position, tokenIndex = position815, tokenIndex815
+						}
+					l815:
+					}
+				l812:
+				l810:
+					{
+						position811, tokenIndex811 := position, tokenIndex
+						{
+							position817, tokenIndex817 := position, tokenIndex
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l818
+							}
+							position++
+							goto l817
+						l818:
+							position, tokenIndex = position817, tokenIndex817
+							if c := buffer[position]; c < rune('0') || c > rune('9') {
+								goto l819
+							}
+							position++
+							goto l817
+						l819:
+							position, tokenIndex = position817, tokenIndex817
 							{
-								position818, tokenIndex818 := position, tokenIndex
+								position820, tokenIndex820 := position, tokenIndex
 								if c := buffer[position]; c < rune('a') || c > rune('f') {
-									goto l819
+									goto l821
 								}
 								position++
-								goto l818
-							l819:
-								position, tokenIndex = position818, tokenIndex818
+								goto l820
+							l821:
+								position, tokenIndex = position820, tokenIndex820
 								if c := buffer[position]; c < rune('A') || c > rune('F') {
-									goto l809
+									goto l811
 								}
 								position++
 							}
-						l818:
+						l820:
 						}
-					l815:
-						goto l808
-					l809:
-						position, tokenIndex = position809, tokenIndex809
+					l817:
+						goto l810
+					l811:
+						position, tokenIndex = position811, tokenIndex811
 					}
-					goto l795
-				l805:
-					position, tokenIndex = position795, tokenIndex795
+					goto l797
+				l807:
+					position, tokenIndex = position797, tokenIndex797
 					if c := buffer[position]; c < rune('0') || c > rune('9') {
-						goto l789
+						goto l791
 					}
 					position++
-				l820:
+				l822:
 					{
-						position821, tokenIndex821 := position, tokenIndex
+						position823, tokenIndex823 := position, tokenIndex
 						if c := buffer[position]; c < rune('0') || c > rune('9') {
-							goto l821
+							goto l823
 						}
 						position++
-						goto l820
-					l821:
-						position, tokenIndex = position821, tokenIndex821
+						goto l822
+					l823:
+						position, tokenIndex = position823, tokenIndex823
 					}
 				}
-			l795:
-				add(ruleOffset, position790)
+			l797:
+				add(ruleOffset, position792)
 			}
 			return true
-		l789:
-			position, tokenIndex = position789, tokenIndex789
+		l791:
+			position, tokenIndex = position791, tokenIndex791
 			return false
 		},
 		/* 52 Section <- <([a-z] / [A-Z] / '@')+> */
 		func() bool {
-			position822, tokenIndex822 := position, tokenIndex
+			position824, tokenIndex824 := position, tokenIndex
 			{
-				position823 := position
+				position825 := position
 				{
-					position826, tokenIndex826 := position, tokenIndex
+					position828, tokenIndex828 := position, tokenIndex
 					if c := buffer[position]; c < rune('a') || c > rune('z') {
-						goto l827
+						goto l829
 					}
 					position++
+					goto l828
+				l829:
+					position, tokenIndex = position828, tokenIndex828
+					if c := buffer[position]; c < rune('A') || c > rune('Z') {
+						goto l830
+					}
+					position++
+					goto l828
+				l830:
+					position, tokenIndex = position828, tokenIndex828
+					if buffer[position] != rune('@') {
+						goto l824
+					}
+					position++
+				}
+			l828:
+			l826:
+				{
+					position827, tokenIndex827 := position, tokenIndex
+					{
+						position831, tokenIndex831 := position, tokenIndex
+						if c := buffer[position]; c < rune('a') || c > rune('z') {
+							goto l832
+						}
+						position++
+						goto l831
+					l832:
+						position, tokenIndex = position831, tokenIndex831
+						if c := buffer[position]; c < rune('A') || c > rune('Z') {
+							goto l833
+						}
+						position++
+						goto l831
+					l833:
+						position, tokenIndex = position831, tokenIndex831
+						if buffer[position] != rune('@') {
+							goto l827
+						}
+						position++
+					}
+				l831:
 					goto l826
 				l827:
-					position, tokenIndex = position826, tokenIndex826
-					if c := buffer[position]; c < rune('A') || c > rune('Z') {
-						goto l828
-					}
-					position++
-					goto l826
-				l828:
-					position, tokenIndex = position826, tokenIndex826
-					if buffer[position] != rune('@') {
-						goto l822
-					}
-					position++
+					position, tokenIndex = position827, tokenIndex827
 				}
-			l826:
-			l824:
-				{
-					position825, tokenIndex825 := position, tokenIndex
-					{
-						position829, tokenIndex829 := position, tokenIndex
-						if c := buffer[position]; c < rune('a') || c > rune('z') {
-							goto l830
-						}
-						position++
-						goto l829
-					l830:
-						position, tokenIndex = position829, tokenIndex829
-						if c := buffer[position]; c < rune('A') || c > rune('Z') {
-							goto l831
-						}
-						position++
-						goto l829
-					l831:
-						position, tokenIndex = position829, tokenIndex829
-						if buffer[position] != rune('@') {
-							goto l825
-						}
-						position++
-					}
-				l829:
-					goto l824
-				l825:
-					position, tokenIndex = position825, tokenIndex825
-				}
-				add(ruleSection, position823)
+				add(ruleSection, position825)
 			}
 			return true
-		l822:
-			position, tokenIndex = position822, tokenIndex822
+		l824:
+			position, tokenIndex = position824, tokenIndex824
 			return false
 		},
 		/* 53 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */
 		func() bool {
-			position832, tokenIndex832 := position, tokenIndex
+			position834, tokenIndex834 := position, tokenIndex
 			{
-				position833 := position
+				position835 := position
 				if buffer[position] != rune('%') {
-					goto l832
+					goto l834
 				}
 				position++
 				{
-					position834, tokenIndex834 := position, tokenIndex
+					position836, tokenIndex836 := position, tokenIndex
 					if c := buffer[position]; c < rune('c') || c > rune('g') {
-						goto l835
+						goto l837
 					}
 					position++
-					goto l834
-				l835:
-					position, tokenIndex = position834, tokenIndex834
+					goto l836
+				l837:
+					position, tokenIndex = position836, tokenIndex836
 					if buffer[position] != rune('s') {
-						goto l832
+						goto l834
 					}
 					position++
 				}
-			l834:
+			l836:
 				if buffer[position] != rune('s') {
-					goto l832
+					goto l834
 				}
 				position++
 				if buffer[position] != rune(':') {
-					goto l832
+					goto l834
 				}
 				position++
-				add(ruleSegmentRegister, position833)
+				add(ruleSegmentRegister, position835)
 			}
 			return true
-		l832:
-			position, tokenIndex = position832, tokenIndex832
+		l834:
+			position, tokenIndex = position834, tokenIndex834
 			return false
 		},
 	}
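
The generated PEG parser above implements `Section <- ([a-z] / [A-Z] / '@')+` with explicit backtracking: each alternative saves a (position, tokenIndex) pair before matching and restores it on failure, and the `+` repetition requires one success before entering its loop. A compact hand-written equivalent of just this rule (a sketch for orientation, not the generated form):

package main

import "fmt"

// matchSection sketches what the generated functions above do for
// `Section <- ([a-z] / [A-Z] / '@')+`: the ordered choice collapses to a
// single predicate, and failure restores the saved start position, which
// is the backtracking that the position/tokenIndex pairs implement.
func matchSection(buffer []rune, position int) (next int, ok bool) {
	start := position
	isSectionRune := func(c rune) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '@'
	}
	for position < len(buffer) && isSectionRune(buffer[position]) {
		position++
	}
	if position == start {
		return start, false // no rune matched: backtrack to the saved position
	}
	return position, true
}

func main() {
	next, ok := matchSection([]rune(".section @progbits"), 9)
	fmt.Println(next, ok) // 18 true: "@progbits" consumed
}
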
diff --git a/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s b/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
index c54756b..7e48e27 100644
--- a/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
+++ b/src/util/fipstools/delocate/testdata/x86_64-Basic/in.s
@@ -47,3 +47,4 @@
 .L4: .L5:	movq %rbx, %rax # This is also legal.
 .size	foo, .-foo
 .type	foo, @function
+.uleb128 .foo-1-.bar
diff --git a/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s b/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
index 23e97c8..a55e852 100644
--- a/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
+++ b/src/util/fipstools/delocate/testdata/x86_64-Basic/out.s
@@ -55,6 +55,7 @@
 	movq %rbx, %rax # This is also legal.
 .size	foo, .-foo
 .type	foo, @function
+.uleb128 .foo-1-.bar
 .text
 .loc 2 2 0
 BORINGSSL_bcm_text_end:
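
The new test line feeds `.uleb128` a symbol expression; as the expected out.s above confirms, delocate must parse `.foo-1-.bar` and emit it unchanged for the assembler to resolve. ULEB128 itself packs a value seven bits per byte, least-significant group first, with the high bit marking continuation. A minimal Go encoder, for reference:

package main

import "fmt"

// appendULEB128 encodes v as unsigned LEB128: seven bits per byte,
// least-significant group first, high bit set on every byte but the last.
func appendULEB128(dst []byte, v uint64) []byte {
	for {
		b := byte(v & 0x7f)
		v >>= 7
		if v != 0 {
			b |= 0x80 // continuation bit: more groups follow
		}
		dst = append(dst, b)
		if v == 0 {
			return dst
		}
	}
}

func main() {
	// The classic DWARF example: 624485 encodes as e5 8e 26.
	fmt.Printf("% x\n", appendULEB128(nil, 624485))
}
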
diff --git a/src/util/fipstools/test-break-kat.sh b/src/util/fipstools/test-break-kat.sh
new file mode 100644
index 0000000..d2c44a7
--- /dev/null
+++ b/src/util/fipstools/test-break-kat.sh
@@ -0,0 +1,40 @@
+# Copyright (c) 2022, Google Inc.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# This script attempts to break each of the known KATs and checks that doing
+# so causes a failure whose output at least mentions the correct KAT.
+
+set -x
+set -e
+
+TEST_FIPS_BIN="build/util/fipstools/test_fips"
+
+if [ ! -f "$TEST_FIPS_BIN" ]; then
+  echo "$TEST_FIPS_BIN is missing. Run this script from the top level of a"
+  echo "BoringSSL checkout and ensure that ./build-fips-break-test-binaries.sh"
+  echo "has been run first."
+  exit 1
+fi
+
+KATS=$(go run util/fipstools/break-kat.go --list-tests)
+
+for kat in $KATS; do
+  go run util/fipstools/break-kat.go "$TEST_FIPS_BIN" "$kat" > break-kat-bin
+  chmod u+x ./break-kat-bin
+  if ! (./break-kat-bin 2>&1 || true) | grep -E -q "^$kat[^a-zA-Z0-9]"; then
+    echo "Failure for $kat did not mention that name in the output"
+    exit 1
+  fi
+  rm ./break-kat-bin
+done
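
The loop above depends on break-kat.go producing, for each KAT name, a copy of test_fips whose embedded known-answer value has been corrupted, so that the power-on self-test fails and names the broken KAT. The per-KAT values live in break-kat.go itself; the sketch below only illustrates the idea (locate the unique embedded test vector and invert its bytes). corruptKAT, its arguments, and the bitwise inversion are assumptions of this sketch, not the tool's actual interface:

package main

import (
	"bytes"
	"encoding/hex"
	"fmt"
	"os"
)

// corruptKAT finds katValue in the binary, insists it occurs exactly once,
// and returns a copy with those bytes inverted so the self-test must fail.
// Hypothetical sketch only; see util/fipstools/break-kat.go for the real tool.
func corruptKAT(binary, katValue []byte) ([]byte, error) {
	i := bytes.Index(binary, katValue)
	if i < 0 {
		return nil, fmt.Errorf("KAT value not found in binary")
	}
	if bytes.Index(binary[i+len(katValue):], katValue) >= 0 {
		return nil, fmt.Errorf("KAT value found more than once; refusing to guess")
	}
	out := append([]byte(nil), binary...)
	for j := range katValue {
		out[i+j] ^= 0xff // assumed corruption strategy: bitwise inversion
	}
	return out, nil
}

func main() {
	if len(os.Args) != 3 {
		fmt.Fprintln(os.Stderr, "usage: corrupt-sketch <binary> <kat-value-hex>")
		os.Exit(1)
	}
	katValue, err := hex.DecodeString(os.Args[2])
	if err != nil {
		panic(err)
	}
	bin, err := os.ReadFile(os.Args[1])
	if err != nil {
		panic(err)
	}
	out, err := corruptKAT(bin, katValue)
	if err != nil {
		panic(err)
	}
	if err := os.WriteFile("break-kat-bin", out, 0o755); err != nil {
		panic(err)
	}
}
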
diff --git a/src/util/fipstools/cavp/test_fips.c b/src/util/fipstools/test_fips.c
similarity index 98%
rename from src/util/fipstools/cavp/test_fips.c
rename to src/util/fipstools/test_fips.c
index dd82d65..b3d5521 100644
--- a/src/util/fipstools/cavp/test_fips.c
+++ b/src/util/fipstools/test_fips.c
@@ -30,9 +30,9 @@
 #include <openssl/rsa.h>
 #include <openssl/sha.h>
 
-#include "../crypto/fipsmodule/rand/internal.h"
-#include "../crypto/fipsmodule/tls/internal.h"
-#include "../crypto/internal.h"
+#include "../../crypto/fipsmodule/rand/internal.h"
+#include "../../crypto/fipsmodule/tls/internal.h"
+#include "../../crypto/internal.h"
 
 
 static void hexdump(const void *a, size_t len) {
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 5cc2de4..3263d9b 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -26,15 +26,15 @@
 # OS_ARCH_COMBOS maps from OS and platform to the OpenSSL assembly "style" for
 # that platform and the extension used by asm files.
 OS_ARCH_COMBOS = [
-    ('ios', 'arm', 'ios32', [], 'S'),
-    ('ios', 'aarch64', 'ios64', [], 'S'),
+    ('apple', 'arm', 'ios32', [], 'S'),
+    ('apple', 'aarch64', 'ios64', [], 'S'),
+    ('apple', 'x86', 'macosx', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
+    ('apple', 'x86_64', 'macosx', [], 'S'),
     ('linux', 'arm', 'linux32', [], 'S'),
     ('linux', 'aarch64', 'linux64', [], 'S'),
     ('linux', 'ppc64le', 'linux64le', [], 'S'),
     ('linux', 'x86', 'elf', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
     ('linux', 'x86_64', 'elf', [], 'S'),
-    ('mac', 'x86', 'macosx', ['-fPIC', '-DOPENSSL_IA32_SSE2'], 'S'),
-    ('mac', 'x86_64', 'macosx', [], 'S'),
     ('win', 'x86', 'win32n', ['-DOPENSSL_IA32_SSE2'], 'asm'),
     ('win', 'x86_64', 'nasm', [], 'asm'),
     ('win', 'aarch64', 'win64', [], 'S'),
@@ -586,12 +586,8 @@
             asm_files)
 
       cmake.write(
-R'''if(APPLE AND ARCH STREQUAL "aarch64")
-  set(CRYPTO_ARCH_SOURCES ${CRYPTO_ios_aarch64_SOURCES})
-elseif(APPLE AND ARCH STREQUAL "arm")
-  set(CRYPTO_ARCH_SOURCES ${CRYPTO_ios_arm_SOURCES})
-elseif(APPLE)
-  set(CRYPTO_ARCH_SOURCES ${CRYPTO_mac_${ARCH}_SOURCES})
+R'''if(APPLE)
+  set(CRYPTO_ARCH_SOURCES ${CRYPTO_apple_${ARCH}_SOURCES})
 elseif(UNIX)
   set(CRYPTO_ARCH_SOURCES ${CRYPTO_linux_${ARCH}_SOURCES})
 elseif(WIN32)
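
The table edit above folds the former 'ios' and 'mac' rows of OS_ARCH_COMBOS into a single 'apple' OS, which is what lets the regenerated CMake select CRYPTO_apple_${ARCH}_SOURCES in one branch instead of special-casing iOS arm and aarch64. Each row carries (os, arch, perlasm style, extra perlasm flags, asm extension); rendered here as a Go table purely for illustration, with invented field names:

package main

import "fmt"

// asmCombo mirrors one OS_ARCH_COMBOS tuple: the perlasm "style" selects
// the assembly dialect, extraArgs are additional perlasm flags, and ext is
// the generated file's extension. Field names are illustrative only.
type asmCombo struct {
	os, arch, perlasmStyle string
	extraArgs              []string
	ext                    string
}

var combos = []asmCombo{
	{"apple", "aarch64", "ios64", nil, "S"},
	{"apple", "x86_64", "macosx", nil, "S"},
	{"linux", "x86_64", "elf", nil, "S"},
	{"win", "x86_64", "nasm", nil, "asm"},
}

func main() {
	for _, c := range combos {
		fmt.Printf("%s-%s -> style %s, .%s\n", c.os, c.arch, c.perlasmStyle, c.ext)
	}
}
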
diff --git a/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm b/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
index 68c74cc..49be6f6 100644
--- a/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
+++ b/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm
@@ -31,6 +31,8 @@
 	mov	r9d,DWORD[r11]
 	mov	r10d,DWORD[4+r11]
 	mov	r11d,DWORD[8+r11]
+	test	r11d,536870912
+	jnz	NEAR $L$shaext_shortcut
 	and	r9d,1073741824
 	and	r10d,268435968
 	or	r10d,r9d
@@ -1795,6 +1797,240 @@
 DB	111,114,103,62,0
 
 ALIGN	64
+sha256_block_data_order_shaext:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_sha256_block_data_order_shaext:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+
+
+
+$L$shaext_shortcut:
+	lea	rsp,[((-88))+rsp]
+	movaps	XMMWORD[(-8-80)+rax],xmm6
+	movaps	XMMWORD[(-8-64)+rax],xmm7
+	movaps	XMMWORD[(-8-48)+rax],xmm8
+	movaps	XMMWORD[(-8-32)+rax],xmm9
+	movaps	XMMWORD[(-8-16)+rax],xmm10
+$L$prologue_shaext:
+	lea	rcx,[((K256+128))]
+	movdqu	xmm1,XMMWORD[rdi]
+	movdqu	xmm2,XMMWORD[16+rdi]
+	movdqa	xmm7,XMMWORD[((512-128))+rcx]
+
+	pshufd	xmm0,xmm1,0x1b
+	pshufd	xmm1,xmm1,0xb1
+	pshufd	xmm2,xmm2,0x1b
+	movdqa	xmm8,xmm7
+DB	102,15,58,15,202,8
+	punpcklqdq	xmm2,xmm0
+	jmp	NEAR $L$oop_shaext
+
+ALIGN	16
+$L$oop_shaext:
+	movdqu	xmm3,XMMWORD[rsi]
+	movdqu	xmm4,XMMWORD[16+rsi]
+	movdqu	xmm5,XMMWORD[32+rsi]
+DB	102,15,56,0,223
+	movdqu	xmm6,XMMWORD[48+rsi]
+
+	movdqa	xmm0,XMMWORD[((0-128))+rcx]
+	paddd	xmm0,xmm3
+DB	102,15,56,0,231
+	movdqa	xmm10,xmm2
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	nop
+	movdqa	xmm9,xmm1
+DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((32-128))+rcx]
+	paddd	xmm0,xmm4
+DB	102,15,56,0,239
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	lea	rsi,[64+rsi]
+DB	15,56,204,220
+DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((64-128))+rcx]
+	paddd	xmm0,xmm5
+DB	102,15,56,0,247
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+DB	15,56,204,229
+DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((96-128))+rcx]
+	paddd	xmm0,xmm6
+DB	15,56,205,222
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+DB	15,56,204,238
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((128-128))+rcx]
+	paddd	xmm0,xmm3
+DB	15,56,205,227
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+DB	15,56,204,243
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((160-128))+rcx]
+	paddd	xmm0,xmm4
+DB	15,56,205,236
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+	nop
+	paddd	xmm6,xmm7
+DB	15,56,204,220
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((192-128))+rcx]
+	paddd	xmm0,xmm5
+DB	15,56,205,245
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+DB	15,56,204,229
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((224-128))+rcx]
+	paddd	xmm0,xmm6
+DB	15,56,205,222
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+DB	15,56,204,238
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((256-128))+rcx]
+	paddd	xmm0,xmm3
+DB	15,56,205,227
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+DB	15,56,204,243
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((288-128))+rcx]
+	paddd	xmm0,xmm4
+DB	15,56,205,236
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+	nop
+	paddd	xmm6,xmm7
+DB	15,56,204,220
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((320-128))+rcx]
+	paddd	xmm0,xmm5
+DB	15,56,205,245
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm6
+DB	102,15,58,15,253,4
+	nop
+	paddd	xmm3,xmm7
+DB	15,56,204,229
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((352-128))+rcx]
+	paddd	xmm0,xmm6
+DB	15,56,205,222
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm3
+DB	102,15,58,15,254,4
+	nop
+	paddd	xmm4,xmm7
+DB	15,56,204,238
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((384-128))+rcx]
+	paddd	xmm0,xmm3
+DB	15,56,205,227
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm4
+DB	102,15,58,15,251,4
+	nop
+	paddd	xmm5,xmm7
+DB	15,56,204,243
+DB	15,56,203,202
+	movdqa	xmm0,XMMWORD[((416-128))+rcx]
+	paddd	xmm0,xmm4
+DB	15,56,205,236
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	movdqa	xmm7,xmm5
+DB	102,15,58,15,252,4
+DB	15,56,203,202
+	paddd	xmm6,xmm7
+
+	movdqa	xmm0,XMMWORD[((448-128))+rcx]
+	paddd	xmm0,xmm5
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+DB	15,56,205,245
+	movdqa	xmm7,xmm8
+DB	15,56,203,202
+
+	movdqa	xmm0,XMMWORD[((480-128))+rcx]
+	paddd	xmm0,xmm6
+	nop
+DB	15,56,203,209
+	pshufd	xmm0,xmm0,0x0e
+	dec	rdx
+	nop
+DB	15,56,203,202
+
+	paddd	xmm2,xmm10
+	paddd	xmm1,xmm9
+	jnz	NEAR $L$oop_shaext
+
+	pshufd	xmm2,xmm2,0xb1
+	pshufd	xmm7,xmm1,0x1b
+	pshufd	xmm1,xmm1,0xb1
+	punpckhqdq	xmm1,xmm2
+DB	102,15,58,15,215,8
+
+	movdqu	XMMWORD[rdi],xmm1
+	movdqu	XMMWORD[16+rdi],xmm2
+	movaps	xmm6,XMMWORD[((-8-80))+rax]
+	movaps	xmm7,XMMWORD[((-8-64))+rax]
+	movaps	xmm8,XMMWORD[((-8-48))+rax]
+	movaps	xmm9,XMMWORD[((-8-32))+rax]
+	movaps	xmm10,XMMWORD[((-8-16))+rax]
+	mov	rsp,rax
+$L$epilogue_shaext:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+
+$L$SEH_end_sha256_block_data_order_shaext:
+
+ALIGN	64
 sha256_block_data_order_ssse3:
 	mov	QWORD[8+rsp],rdi	;WIN64 prologue
 	mov	QWORD[16+rsp],rsi
@@ -4115,11 +4351,46 @@
 	pop	rsi
 	DB	0F3h,0C3h		;repret
 
+
+ALIGN	16
+shaext_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	lea	r10,[$L$prologue_shaext]
+	cmp	rbx,r10
+	jb	NEAR $L$in_prologue
+
+	lea	r10,[$L$epilogue_shaext]
+	cmp	rbx,r10
+	jae	NEAR $L$in_prologue
+
+	lea	rsi,[((-8-80))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,10
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$in_prologue
+
 section	.pdata rdata align=4
 ALIGN	4
 	DD	$L$SEH_begin_sha256_block_data_order wrt ..imagebase
 	DD	$L$SEH_end_sha256_block_data_order wrt ..imagebase
 	DD	$L$SEH_info_sha256_block_data_order wrt ..imagebase
+	DD	$L$SEH_begin_sha256_block_data_order_shaext wrt ..imagebase
+	DD	$L$SEH_end_sha256_block_data_order_shaext wrt ..imagebase
+	DD	$L$SEH_info_sha256_block_data_order_shaext wrt ..imagebase
 	DD	$L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase
 	DD	$L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase
 	DD	$L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase
@@ -4132,6 +4403,9 @@
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
 	DD	$L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha256_block_data_order_shaext:
+DB	9,0,0,0
+	DD	shaext_handler wrt ..imagebase
 $L$SEH_info_sha256_block_data_order_ssse3:
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
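
The large addition above wires a SHA-NI path into the Windows build of sha256_block_data_order: the new `test r11d,536870912` / `jnz $L$shaext_shortcut` dispatch checks bit 29 of the cached CPUID leaf-7 EBX word (1 << 29 == 536870912, the SHA extensions flag) before the existing AVX/SSSE3 checks; the `DB` byte runs in the body spell out the SHA-NI opcodes (15,56,203 is sha256rnds2, 15,56,204 and 15,56,205 are sha256msg1 and sha256msg2) for assemblers that predate those mnemonics; and shaext_handler plus the new .pdata and .xdata entries register SEH unwind info so an exception inside the routine restores the xmm6 through xmm10 saves from the prologue. A sketch of just the feature-bit test:

package main

import "fmt"

// Bit 29 of CPUID.(EAX=7,ECX=0):EBX advertises the SHA extensions
// (sha256rnds2 / sha256msg1 / sha256msg2); 536870912 == 1 << 29, the
// constant tested by the new dispatch above.
const shaExtensions = 1 << 29

// hasSHA mirrors `test r11d,536870912`: leaf7EBX stands for the cached
// CPUID word that the assembly loads from DWORD[8+r11].
func hasSHA(leaf7EBX uint32) bool {
	return leaf7EBX&shaExtensions != 0
}

func main() {
	fmt.Println(hasSHA(1 << 29)) // true: take the shaext shortcut
	fmt.Println(hasSHA(0))       // false: fall through to SSSE3/AVX checks
}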